[llvm] [LoadStoreVectorizer] Support vectorization of mixed-type contiguous accesses (PR #177908)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 08:13:28 PDT 2026


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/177908

>From 03d178b098b91bb74b2188bdb704dfa87da2c2c1 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 20 Dec 2025 02:32:15 -0500
Subject: [PATCH] [LoadStoreVectorizer] Support vectorization of mixed-type
 contiguous accesses

Relaxes the LoadStoreVectorizer to support merging contiguous loads and
stores of different element types (e.g., merging an i32 and an i16 load).

Key changes:
- Remove `ElementSize` from the `EqClassKey`, allowing instructions with
  different scalar types to be grouped together if they share the same
  underlying object and address space.
- Calculate the vector element type using the Greatest Common Divisor (GCD)
  of all scalar bitwidths in the chain.
- Add `splitChainByPointerWidth` to ensure pointer-typed elements whose bit-
  width exceeds the GCD are partitioned into separate chains. This prevents
  vectorizing pointers at a width smaller than their native size.
- Add `resizeAndBitcast` to handle mismatched type conversions during
  vectorization.
- Update `vectorizeChain` to use shuffles, extracts, and bitcasts to
  reconstruct original values from the vectorized result (for loads) or
  prepare values for insertion (for stores).
- Update and add extensive tests covering mixed integer sizes, pointer
  types, and floating-point mixes. Existing AMDGPU and X86 tests are
  updated to reflect the more aggressive vectorization.

Resolves #97715
---
 .../Vectorize/LoadStoreVectorizer.cpp         | 311 ++++++--
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 139 ++--
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |  17 +-
 llvm/test/CodeGen/AMDGPU/add_i128.ll          |  14 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     | 234 +++---
 .../AMDGPU/av-split-dead-valno-crash.ll       |  58 +-
 .../branch-folding-implicit-def-subreg.ll     | 560 ++++++++-------
 .../AMDGPU/branch-relaxation-gfx1250.ll       |  94 ++-
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 138 ++--
 llvm/test/CodeGen/AMDGPU/ctpop64.ll           |  60 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |  30 +-
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |  58 +-
 .../CodeGen/AMDGPU/extract_vector_elt-f64.ll  |  10 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i16.ll  |  29 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i64.ll  |  14 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      | 300 ++++----
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  38 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     | 672 ++++++++----------
 .../AMDGPU/fp64-min-max-buffer-atomics.ll     | 101 ++-
 .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 101 ++-
 .../CodeGen/AMDGPU/gep-const-address-space.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    | 380 ++++++----
 .../AMDGPU/infer-addrspace-flat-atomic.ll     |  91 +--
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  90 +--
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 228 +++---
 .../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll    |  26 +-
 ...llvm.amdgcn.raw.buffer.store.format.d16.ll |  16 +-
 ....amdgcn.raw.ptr.buffer.store.format.d16.ll |  16 +-
 ...m.amdgcn.struct.buffer.store.format.d16.ll |  16 +-
 ...dgcn.struct.ptr.buffer.store.format.d16.ll |  16 +-
 ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll |  50 +-
 .../llvm.amdgcn.struct.tbuffer.store.d16.ll   |  50 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 280 ++++----
 ...ne-sink-temporal-divergence-swdev407790.ll |  51 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         | 102 ++-
 .../AMDGPU/memset-param-combinations.ll       | 135 ++--
 .../CodeGen/AMDGPU/misched-remat-revert.ll    | 123 ++--
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        |  34 +-
 llvm/test/CodeGen/AMDGPU/pr155452.ll          |  49 +-
 llvm/test/CodeGen/AMDGPU/select-vectors.ll    |   2 -
 .../siloadstoreopt-misaligned-regsequence.ll  |  10 +-
 .../AMDGPU/simulated-trap-pseudo-expand.ll    |  15 +-
 .../AMDGPU/tuple-allocation-failure.ll        | 272 ++++---
 .../AMDGPU/undef-handling-crash-in-ra.ll      |  28 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  36 +-
 llvm/test/CodeGen/NVPTX/dag-cse.ll            |   6 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |  10 +-
 .../AMDGPU/merge-stores.ll                    |   4 +-
 .../AMDGPU/merge-vectors-complex.ll           |  71 +-
 .../AMDGPU/merge-vectors.ll                   |   8 +-
 .../AMDGPU/pointer-elements.ll                |   3 +-
 .../LoadStoreVectorizer/X86/merge-tbaa.ll     |   4 +-
 .../X86/mismatched-gep-stride.ll              |  16 +-
 .../vectorize-i8-nested-add-inseltpoison.ll   |  10 +-
 .../X86/vectorize-i8-nested-add.ll            |  60 +-
 55 files changed, 2682 insertions(+), 2608 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89ba9227d8952..51bfd87e2419a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -126,21 +126,16 @@ namespace {
 
 // Equivalence class key, the initial tuple by which we group loads/stores.
 // Loads/stores with different EqClassKeys are never merged.
-//
-// (We could in theory remove element-size from the this tuple.  We'd just need
-// to fix up the vector packing/unpacking code.)
 using EqClassKey =
     std::tuple<const Value * /* result of getUnderlyingObject() */,
                unsigned /* AddrSpace */,
-               unsigned /* Load/Store element size bits */,
                char /* IsLoad; char b/c bool can't be a DenseMap key */
                >;
 [[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                                const EqClassKey &K) {
-  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
+  const auto &[UnderlyingObject, AddrSpace, IsLoad] = K;
   return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
-            << " of element size " << ElementSize << " bits in addrspace "
-            << AddrSpace;
+            << " in addrspace " << AddrSpace;
 }
 
 // A Chain is a set of instructions such that:
@@ -298,6 +293,24 @@ class Vectorizer {
   /// Discards any length-1 subchains.
   std::vector<Chain> splitChainByAlignment(Chain &C);
 
+  /// Splits out pointer-typed elements whose scalar size exceeds the GCD
+  /// element size of the chain, so that pointers are never vectorized at a
+  /// width smaller than their native size. Non-pointer elements and pointers
+  /// whose size matches the GCD remain in the main chain.
+  std::vector<Chain> splitChainByPointerWidth(Chain &C);
+
+  /// Inserts a cast or a series of casts to convert \p V to \p NewTy.
+  ///
+  /// This function handles cases where the bit-widths of the source and
+  /// destination types differ by first converting the value to a scalar
+  /// integer of the original width, resizing it via zero-extension or
+  /// truncation, and then casting it to the final destination type.
+  ///
+  /// \param V The value to be cast.
+  /// \param NewTy The target type for the value.
+  /// \returns The resulting casted Value.
+  Value *resizeAndBitcast(Value *V, Type *NewTy);
+
   /// Converts the instrs in the chain into a single vectorized load or store.
   /// Adds the old scalar loads/stores to ToErase.
   bool vectorizeChain(Chain &C);
@@ -314,8 +327,15 @@ class Vectorizer {
                                                 unsigned Depth);
 
   /// Gets the element type of the vector that the chain will load or store.
-  /// This is nontrivial because the chain may contain elements of different
-  /// types; e.g. it's legal to have a chain that contains both i32 and float.
+  ///
+  /// The element type is determined by taking the GCD of the bitwidths of all
+  /// elements in the chain. Among types with this matching bitwidth, we prefer:
+  /// - A type that appears most frequently in the chain, to minimize casts.
+  /// - An integer type if the chain contains pointers, to avoid direct
+  ///   pointer-to-floating-point conversions.
+  ///
+  /// \param C The chain of instructions to be vectorized.
+  /// \returns The chosen element type for the vectorized load or store.
   Type *getChainElemTy(const Chain &C);
 
   /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
@@ -555,10 +575,11 @@ bool Vectorizer::runOnChain(Chain &C) {
   // all at worst O(n lg n) in the number of instructions, and splitting chains
   // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
   bool Changed = false;
-  for (auto &C : splitChainByMayAliasInstrs(C))
-    for (auto &C : splitChainByContiguity(C))
-      for (auto &C : splitChainByAlignment(C))
-        Changed |= vectorizeChain(C);
+  for (auto &C : splitChainByPointerWidth(C))
+    for (auto &C : splitChainByMayAliasInstrs(C))
+      for (auto &C : splitChainByContiguity(C))
+        for (auto &C : splitChainByAlignment(C))
+          Changed |= vectorizeChain(C);
   return Changed;
 }
 
@@ -793,29 +814,36 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
 
 Type *Vectorizer::getChainElemTy(const Chain &C) {
   assert(!C.empty());
-  // The rules are:
-  //  - If there are any pointer types in the chain, use an integer type.
-  //  - Prefer an integer type if it appears in the chain.
-  //  - Otherwise, use the first type in the chain.
-  //
-  // The rule about pointer types is a simplification when we merge e.g.  a load
-  // of a ptr and a double.  There's no direct conversion from a ptr to a
-  // double; it requires a ptrtoint followed by a bitcast.
-  //
-  // It's unclear to me if the other rules have any practical effect, but we do
-  // it to match this pass's previous behavior.
-  if (any_of(C, [](const ChainElem &E) {
-        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
-      })) {
-    return Type::getIntNTy(
-        F.getContext(),
-        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
+
+  unsigned GCDSize = 0;
+  bool HasPointers = false;
+  for (const ChainElem &E : C) {
+    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+    if (Ty->isPointerTy())
+      HasPointers = true;
+    unsigned Sz = DL.getTypeSizeInBits(Ty);
+    GCDSize = GCDSize == 0 ? Sz : std::gcd(GCDSize, Sz);
   }
 
-  for (const ChainElem &E : C)
-    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
-      return T;
-  return getLoadStoreType(C[0].Inst)->getScalarType();
+  // Among non-pointer types whose size matches the GCD, prefer the most
+  // common one to minimize bitcasts. If the chain contains pointers, only
+  // consider integer types (there's no direct FP-to-pointer cast).
+  DenseMap<Type *, unsigned> TypeCounts;
+  Type *BestTy = nullptr;
+  unsigned BestCount = 0;
+  for (const ChainElem &E : C) {
+    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+    if (Ty->isPointerTy() || DL.getTypeSizeInBits(Ty) != GCDSize ||
+        (HasPointers && !Ty->isIntegerTy()))
+      continue;
+    unsigned Count = ++TypeCounts[Ty];
+    if (Count > BestCount) {
+      BestCount = Count;
+      BestTy = Ty;
+    }
+  }
+
+  return BestTy ? BestTy : Type::getIntNTy(F.getContext(), GCDSize);
 }
 
 std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
@@ -1075,6 +1103,125 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
   return Ret;
 }
 
+std::vector<Chain> Vectorizer::splitChainByPointerWidth(Chain &C) {
+  if (C.empty())
+    return {};
+
+  sortChainInOffsetOrder(C);
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: splitChainByPointerWidth considering chain:\n";
+    dumpChain(C);
+  });
+
+  unsigned GCDSize = 0;
+  for (const ChainElem &E : C) {
+    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+    unsigned Sz = DL.getTypeSizeInBits(Ty);
+    GCDSize = GCDSize == 0 ? Sz : std::gcd(GCDSize, Sz);
+  }
+
+  Chain MainChain;
+  DenseMap<unsigned, Chain> PtrGroups;
+
+  for (ChainElem &E : C) {
+    Type *ScalarTy = getLoadStoreType(E.Inst)->getScalarType();
+    unsigned Sz = DL.getTypeSizeInBits(ScalarTy);
+    if (ScalarTy->isPointerTy() && Sz > GCDSize)
+      PtrGroups[Sz].push_back(E);
+    else
+      MainChain.push_back(E);
+  }
+
+  std::vector<Chain> Ret;
+  if (MainChain.size() > 1)
+    Ret.push_back(std::move(MainChain));
+  for (auto &[Size, Group] : PtrGroups)
+    if (Group.size() > 1)
+      Ret.push_back(std::move(Group));
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: splitChainByPointerWidth produced " << Ret.size()
+           << " chains\n";
+    for (const auto &Chain : Ret) {
+      dbgs() << "  Chain:\n";
+      dumpChain(Chain);
+    }
+  });
+
+  return Ret;
+}
+
+/// Insert a cast from V's type to NewTy, preserving the underlying bit pattern.
+///
+/// This is used during vectorization to convert between the original types of
+/// chain elements and the vectorized element type (VecElemTy). For example,
+/// when vectorizing a chain of [i32, ptr addrspace(3), float], the vectorized
+/// type might be <3 x i32>, so each element must be cast to/from i32.
+///
+/// The function handles two cases:
+///
+/// 1. Mismatched bitwidths (e.g., i16 value into an i32 vector slot):
+///    Convert the source to a scalar integer of its original width, then
+///    zero-extend or truncate to the target width. Finally, cast to the
+///    destination type via CreateBitOrPointerCast.
+///
+/// 2. Same bitwidth (the common case):
+///    Delegate directly to CreateBitOrPointerCast, which handles
+///    int<->int, int<->ptr, int<->float, and ptr<->ptr casts natively.
+///
+/// Note: FP<->Ptr casts cannot occur here because getChainElemTy returns an FP
+/// type only when ALL chain elements are FP (so no element has a pointer type),
+/// and it never returns a pointer type itself.
+Value *Vectorizer::resizeAndBitcast(Value *V, Type *NewTy) {
+  auto *VTy = V->getType();
+  assert(VTy != NewTy && "Types should mismatch in this function.");
+  assert(!(VTy->isFPOrFPVectorTy() && NewTy->isPtrOrPtrVectorTy()) &&
+         "FP->Ptr cast should be unreachable: getChainElemTy never produces "
+         "FP when the chain contains pointers.");
+  assert(!(VTy->isPtrOrPtrVectorTy() && NewTy->isFPOrFPVectorTy()) &&
+         "Ptr->FP cast should be unreachable: getChainElemTy never returns "
+         "a pointer type.");
+
+  unsigned VTySize = DL.getTypeSizeInBits(VTy);
+  unsigned NewTySize = DL.getTypeSizeInBits(NewTy);
+
+  // Case 1: Mismatched bitwidths. The source value V is always integer-typed
+  // here because:
+  //   - In the load path, V comes from extractelement/shufflevector of the
+  //     vectorized load, which has element type VecElemTy (always int or FP).
+  //   - In the store path, splitChainByPointerWidth guarantees pointer values
+  //     have scalar size == GCD == VecElemTy size, so sizes always match.
+  //   - For FP values, the total bitwidth of V always equals the target's
+  //     bitwidth (NumElems is computed to exactly span the original type).
+  // Therefore, only integer-to-integer resizing (with possible vector
+  // flattening) is needed before the final cast.
+  if (VTySize != NewTySize) {
+    assert(!VTy->isPtrOrPtrVectorTy() &&
+           "Pointer values should never have mismatched bitwidths: "
+           "splitChainByPointerWidth guarantees ptr size == GCD == "
+           "VecElemTy size.");
+    assert(!VTy->isFPOrFPVectorTy() &&
+           "FP values should never have mismatched bitwidths: "
+           "V is always VecElemTy-typed from extract/shuffle.");
+    assert(VTy->isIntOrIntVectorTy() &&
+           "V should be integer-typed in mismatched-bitwidth path.");
+
+    // Flatten vector integers (e.g., <2 x i8> from a shufflevector) to a
+    // single scalar integer so we can perform ZExt/Trunc.
+    if (V->getType()->isVectorTy())
+      V = Builder.CreateBitCast(V, Type::getIntNTy(F.getContext(), VTySize));
+
+    // Resize to the target bitwidth.
+    V = Builder.CreateZExtOrTrunc(V,
+                                  Type::getIntNTy(F.getContext(), NewTySize));
+  }
+
+  // Case 2 (or final step of Case 1): same-width cast to the destination type.
+  // CreateBitPreservingCastChain handles int<->int, int<->ptr, and int<->float.
+  return Builder.CreateBitOrPointerCast(V, NewTy);
+}
+
 bool Vectorizer::vectorizeChain(Chain &C) {
   if (C.size() < 2)
     return false;
@@ -1095,8 +1242,10 @@ bool Vectorizer::vectorizeChain(Chain &C) {
   });
 
   Type *VecElemTy = getChainElemTy(C);
+  unsigned VecElemSize = DL.getTypeSizeInBits(VecElemTy);
   bool IsLoadChain = isa<LoadInst>(C[0].Inst);
   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+
   unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
   APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
   unsigned ChainBytes = BytesAdded;
@@ -1110,7 +1259,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
     PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
   }
 
-  assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
+  assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0 &&
+         "ChainBytes must be a multiple of VecElemTy size");
+
   // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
   // than 1 byte (e.g. VecTy == <32 x i1>).
   unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
@@ -1126,11 +1277,17 @@ bool Vectorizer::vectorizeChain(Chain &C) {
                                    MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
   }
 
-  // All elements of the chain must have the same scalar-type size.
 #ifndef NDEBUG
-  for (const ChainElem &E : C)
-    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
-           DL.getTypeStoreSize(VecElemTy));
+  for (const ChainElem &E : C) {
+    unsigned ElementSize =
+        DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType());
+    unsigned VectorElementSize = DL.getTypeStoreSize(VecElemTy);
+    // Verify that the individual element size is a multiple of the vector
+    // element size.
+    assert(ElementSize % VectorElementSize == 0 &&
+           "Element size is not a multiple of vector element size, which would "
+           "break vectorization layout.");
+  }
 #endif
 
   Instruction *VecInst;
@@ -1164,6 +1321,8 @@ bool Vectorizer::vectorizeChain(Chain &C) {
     for (const ChainElem &E : C) {
       Instruction *I = E.Inst;
       Value *V;
+      unsigned NumElems =
+          DL.getTypeSizeInBits(getLoadStoreType(I)) / VecElemSize;
       Type *T = getLoadStoreType(I);
       unsigned EOffset =
           (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
@@ -1174,13 +1333,23 @@ bool Vectorizer::vectorizeChain(Chain &C) {
         auto Mask = llvm::to_vector<8>(
             llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
         V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
+      } else if (NumElems > 1) {
+        auto Mask =
+            llvm::to_vector<8>(llvm::seq<int>(VecIdx, VecIdx + NumElems));
+        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
       } else {
         V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                          I->getName());
       }
-      if (V->getType() != I->getType())
-        V = Builder.CreateBitOrPointerCast(V, I->getType());
+
+      Type *ExpectedTy = getLoadStoreType(I);
+      if (V->getType() != ExpectedTy) {
+        LLVM_DEBUG(dbgs() << "LSV: Casting " << *V << " to " << *ExpectedTy
+                          << '\n');
+        V = resizeAndBitcast(V, ExpectedTy);
+      }
       I->replaceAllUsesWith(V);
+      VecIdx += NumElems;
     }
 
     // Finally, we need to reorder the instrs in the BB so that the (transitive)
@@ -1211,26 +1380,33 @@ bool Vectorizer::vectorizeChain(Chain &C) {
 
     // Build the vector to store.
     Value *Vec = PoisonValue::get(VecTy);
-    auto InsertElem = [&](Value *V, unsigned VecIdx) {
-      if (V->getType() != VecElemTy)
-        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
-      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
-    };
+    unsigned VecIdx = 0;
+
     for (const ChainElem &E : C) {
       auto *I = cast<StoreInst>(E.Inst);
-      unsigned EOffset =
-          (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
-      unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
-      if (FixedVectorType *VT =
-              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
-        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
-          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
-                                                  Builder.getInt32(J)),
-                     VecIdx++);
-        }
+      Value *Val = I->getValueOperand();
+      unsigned VecElemSize = DL.getTypeSizeInBits(VecElemTy);
+      unsigned NumElems = DL.getTypeSizeInBits(Val->getType()) / VecElemSize;
+
+      Value *CastVal = Val;
+      if (NumElems == 1) {
+        if (CastVal->getType() != VecElemTy)
+          CastVal = resizeAndBitcast(CastVal, VecElemTy);
+        Vec =
+            Builder.CreateInsertElement(Vec, CastVal, Builder.getInt32(VecIdx));
       } else {
-        InsertElem(I->getValueOperand(), VecIdx);
+        Type *SubVecTy = FixedVectorType::get(VecElemTy, NumElems);
+        if (CastVal->getType() != SubVecTy)
+          CastVal = resizeAndBitcast(CastVal, SubVecTy);
+
+        for (unsigned ElmIdx = 0; ElmIdx < NumElems; ++ElmIdx) {
+          Value *Elt =
+              Builder.CreateExtractElement(CastVal, Builder.getInt32(ElmIdx));
+          Vec = Builder.CreateInsertElement(Vec, Elt,
+                                            Builder.getInt32(VecIdx + ElmIdx));
+        }
       }
+      VecIdx += NumElems;
     }
 
     // If the chain originates from extra stores, we need to vectorize into a
@@ -1580,13 +1756,12 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
     return;
 
   // The reduced key has all elements of the ECClassKey except the underlying
-  // object. Check that EqClassKey has 4 elements and define the reduced key.
-  static_assert(std::tuple_size_v<EqClassKey> == 4,
+  // object. Check that EqClassKey has 3 elements and define the reduced key.
+  static_assert(std::tuple_size_v<EqClassKey> == 3,
                 "EqClassKey has changed - EqClassReducedKey needs changes too");
   using EqClassReducedKey =
       std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
-                 std::tuple_element_t<2, EqClassKey> /* Element size */,
-                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+                 std::tuple_element_t<2, EqClassKey> /* IsLoad; */>;
   using ECReducedKeyToUnderlyingObjectMap =
       MapVector<EqClassReducedKey,
                 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
@@ -1598,8 +1773,7 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
   bool FoundPotentiallyOptimizableEC = false;
   for (const auto &EC : EQClasses) {
     const auto &Key = EC.first;
-    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
-                             std::get<3>(Key)};
+    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key)};
     auto &UOMap = RedKeyToUOMap[RedKey];
     UOMap.insert(std::get<0>(Key));
     if (UOMap.size() > 1)
@@ -1620,8 +1794,7 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
     dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
     for (const auto &RedKeyToUO : RedKeyToUOMap) {
       dbgs() << "  Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
-             << std::get<1>(RedKeyToUO.first) << ", "
-             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
+             << static_cast<int>(std::get<1>(RedKeyToUO.first)) << "} --> "
              << RedKeyToUO.second.size() << " underlying objects:\n";
       for (auto UObject : RedKeyToUO.second)
         dbgs() << "    " << *UObject << '\n';
@@ -1661,10 +1834,9 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
       if (UObject == UltimateTarget)
         continue;
 
-      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
-                         std::get<2>(RedKey)};
-      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
-                       std::get<2>(RedKey)};
+      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey)};
+      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey),
+                       std::get<1>(RedKey)};
       // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
       // request the reference to the instructions vector for KeyTo first.
       const auto &VecTo = EQClasses[KeyTo];
@@ -1756,7 +1928,6 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
       continue;
 
     Ret[{GetUnderlyingObject(Ptr), AS,
-         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
          /*IsLoad=*/LI != nullptr}]
         .emplace_back(&I);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index bdf54bd66c866..e532266b2197d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -686,106 +686,123 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x6
+; GFX7-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_mov_b32_e32 v2, s12
-; GFX7-NEXT:    v_mov_b32_e32 v4, s14
-; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    v_mov_b32_e32 v3, s13
-; GFX7-NEXT:    v_mov_b32_e32 v5, s15
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s6
+; GFX7-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX7-NEXT:    s_cselect_b64 vcc, exec, 0
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX7-NEXT:    s_mov_b32 s10, -1
-; GFX7-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x20
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
+; GFX8-NEXT:    s_load_dword s8, s[4:5], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NEXT:    v_mov_b32_e32 v4, s14
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_cselect_b64 vcc, exec, 0
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX10_W32-NEXT:    s_load_dword s0, s[4:5], 0x20
+; GFX10_W32-NEXT:    s_clause 0x2
+; GFX10_W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX10_W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
+; GFX10_W32-NEXT:    s_load_dword s8, s[4:5], 0x20
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s12
-; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s14
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s13
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s15
-; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10_W32-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 vcc_lo, exec_lo, 0
-; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX10_W64-NEXT:    s_load_dword s0, s[4:5], 0x20
+; GFX10_W64-NEXT:    s_clause 0x2
+; GFX10_W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX10_W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
+; GFX10_W64-NEXT:    s_load_dword s8, s[4:5], 0x20
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s12
-; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s14
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s13
-; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s15
-; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10_W64-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10_W64-NEXT:    s_cselect_b64 vcc, exec, 0
-; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b256 s[8:15], s[4:5], 0x0
-; GFX11_W32-NEXT:    s_load_b32 s0, s[4:5], 0x20
+; GFX11_W32-NEXT:    s_clause 0x2
+; GFX11_W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11_W32-NEXT:    s_load_b64 s[6:7], s[4:5], 0x18
+; GFX11_W32-NEXT:    s_load_b32 s8, s[4:5], 0x20
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11_W32-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11_W32-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11_W32-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11_W32-NEXT:    s_cselect_b32 vcc_lo, exec_lo, 0
-; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11_W32-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
+; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11_W32-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b256 s[8:15], s[4:5], 0x0
-; GFX11_W64-NEXT:    s_load_b32 s0, s[4:5], 0x20
+; GFX11_W64-NEXT:    s_clause 0x2
+; GFX11_W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11_W64-NEXT:    s_load_b64 s[6:7], s[4:5], 0x18
+; GFX11_W64-NEXT:    s_load_b32 s8, s[4:5], 0x20
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s12
-; GFX11_W64-NEXT:    v_mov_b32_e32 v2, s14
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s13
-; GFX11_W64-NEXT:    v_mov_b32_e32 v3, s15
-; GFX11_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11_W64-NEXT:    v_mov_b32_e32 v3, s7
+; GFX11_W64-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11_W64-NEXT:    s_cselect_b64 vcc, exec, 0
-; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11_W64-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
+; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11_W64-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
   store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index b58e04d80a229..e50607d93e11b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -40,8 +40,7 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) {
 define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; CI-LABEL: s_trig_preop_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
@@ -60,8 +59,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; VI-LABEL: s_trig_preop_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
@@ -80,8 +78,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX9-LABEL: s_trig_preop_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -97,9 +94,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; GFX10-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], s2
 ; GFX10-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
@@ -108,9 +103,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX11-LABEL: s_trig_preop_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], s2
 ; GFX11-NEXT:    flat_store_b64 v[0:1], v[0:1] dlc
diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll
index dcaa856cc1016..5a3c378b901d0 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll
@@ -90,15 +90,17 @@ define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out,
 define amdgpu_kernel void @test_sreg(ptr addrspace(1) noalias %out, i128 %a, i128 %b) {
 ; GCN-LABEL: test_sreg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x11
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s4, s8, s12
-; GCN-NEXT:    s_addc_u32 s5, s9, s13
-; GCN-NEXT:    s_addc_u32 s6, s10, s14
-; GCN-NEXT:    s_addc_u32 s7, s11, s15
+; GCN-NEXT:    s_add_u32 s4, s10, s14
+; GCN-NEXT:    s_addc_u32 s5, s11, s15
+; GCN-NEXT:    s_addc_u32 s6, s12, s0
+; GCN-NEXT:    s_addc_u32 s7, s13, s1
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s0, s8
+; GCN-NEXT:    s_mov_b32 s1, s9
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index bb7beb8d0b9e2..957a0c4fb62aa 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -510,52 +510,50 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
 ; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX908-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX908-NEXT:    s_mov_b32 s12, 0
-; GFX908-NEXT:    s_mov_b32 s9, s12
-; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX908-NEXT:    s_sub_i32 s1, 0, s7
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s0
+; GFX908-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX908-NEXT:    s_mov_b32 s4, 0
+; GFX908-NEXT:    s_mov_b32 s7, s4
 ; GFX908-NEXT:    v_mov_b32_e32 v17, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX908-NEXT:    s_sub_i32 s0, 0, s15
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s18
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX908-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX908-NEXT:    s_mul_i32 s1, s1, s2
-; GFX908-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX908-NEXT:    s_add_i32 s2, s2, s1
-; GFX908-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX908-NEXT:    s_mul_i32 s2, s1, s7
-; GFX908-NEXT:    s_sub_i32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_sub_i32 s6, s2, s7
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX908-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX908-NEXT:    s_mul_i32 s0, s0, s1
+; GFX908-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX908-NEXT:    s_add_i32 s1, s1, s0
+; GFX908-NEXT:    s_mul_hi_u32 s0, s14, s1
+; GFX908-NEXT:    s_mul_i32 s1, s0, s15
+; GFX908-NEXT:    s_sub_i32 s1, s14, s1
+; GFX908-NEXT:    s_add_i32 s2, s0, 1
+; GFX908-NEXT:    s_sub_i32 s3, s1, s15
+; GFX908-NEXT:    s_cmp_ge_u32 s1, s15
+; GFX908-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX908-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX908-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX908-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX908-NEXT:    s_add_i32 s2, s0, 1
+; GFX908-NEXT:    s_cmp_ge_u32 s1, s15
+; GFX908-NEXT:    s_cselect_b32 s6, s2, s0
+; GFX908-NEXT:    s_lshr_b32 s2, s18, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v19, s2
-; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX908-NEXT:    s_lshl_b64 s[8:9], s[12:13], 5
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX908-NEXT:    s_lshl_b64 s[14:15], s[6:7], 5
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
 ; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX908-NEXT:    s_mul_i32 s3, s5, s2
-; GFX908-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX908-NEXT:    s_mul_i32 s2, s4, s2
+; GFX908-NEXT:    s_mul_i32 s3, s13, s2
+; GFX908-NEXT:    s_mul_hi_u32 s5, s12, s2
+; GFX908-NEXT:    s_mul_i32 s2, s12, s2
 ; GFX908-NEXT:    s_add_i32 s3, s5, s3
-; GFX908-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX908-NEXT:    s_lshl_b64 s[12:13], s[2:3], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
-; GFX908-NEXT:  .LBB3_1: ; %Flow20
+; GFX908-NEXT:  .LBB3_1: ; %Flow23
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
@@ -568,40 +566,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT:    s_cmp_lt_i32 s11, 0
-; GFX908-NEXT:    s_mov_b32 s13, s12
+; GFX908-NEXT:    s_mov_b32 s5, s4
+; GFX908-NEXT:    s_cmp_lt_i32 s17, 0
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    v_mov_b32_e32 v7, s5
+; GFX908-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX908-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GFX908-NEXT:    s_cmp_gt_i32 s11, -1
-; GFX908-NEXT:    v_mov_b32_e32 v4, s12
-; GFX908-NEXT:    v_mov_b32_e32 v6, s12
-; GFX908-NEXT:    v_mov_b32_e32 v8, s12
-; GFX908-NEXT:    v_mov_b32_e32 v5, s13
+; GFX908-NEXT:    s_cmp_gt_i32 s17, -1
+; GFX908-NEXT:    v_mov_b32_e32 v5, s5
+; GFX908-NEXT:    v_mov_b32_e32 v6, s4
+; GFX908-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX908-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX908-NEXT:    v_mov_b32_e32 v7, s13
-; GFX908-NEXT:    v_mov_b32_e32 v9, s13
 ; GFX908-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[2:3]
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v10
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT:    s_mov_b64 s[20:21], s[10:11]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s13, v3
-; GFX908-NEXT:    s_add_u32 s9, s9, 1
-; GFX908-NEXT:    s_addc_u32 s13, s13, 0
-; GFX908-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX908-NEXT:    s_mul_i32 s13, s6, s13
-; GFX908-NEXT:    s_mul_i32 s23, s7, s9
-; GFX908-NEXT:    s_add_i32 s13, s22, s13
-; GFX908-NEXT:    s_mul_i32 s9, s6, s9
-; GFX908-NEXT:    s_add_i32 s13, s13, s23
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX908-NEXT:    s_add_u32 s5, s5, 1
+; GFX908-NEXT:    s_addc_u32 s7, s7, 0
+; GFX908-NEXT:    s_mul_hi_u32 s22, s8, s5
+; GFX908-NEXT:    s_mul_i32 s7, s8, s7
+; GFX908-NEXT:    s_mul_i32 s23, s9, s5
+; GFX908-NEXT:    s_add_i32 s7, s22, s7
+; GFX908-NEXT:    s_mul_i32 s5, s8, s5
+; GFX908-NEXT:    s_add_i32 s7, s7, s23
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s20, s20, s4
-; GFX908-NEXT:    s_addc_u32 s21, s21, s5
+; GFX908-NEXT:    s_add_u32 s20, s20, s12
+; GFX908-NEXT:    s_addc_u32 s21, s21, s13
 ; GFX908-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX908-NEXT:    v_cmp_lt_i32_e64 s[24:25], -1, v3
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
@@ -609,8 +607,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    s_add_u32 s22, s20, s9
-; GFX908-NEXT:    s_addc_u32 s23, s21, s13
+; GFX908-NEXT:    s_add_u32 s22, s20, s5
+; GFX908-NEXT:    s_addc_u32 s23, s21, s7
 ; GFX908-NEXT:    global_load_dword v21, v17, s[22:23] offset:16 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    global_load_dword v20, v17, s[22:23] offset:20 glc
@@ -654,17 +652,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX908-NEXT:  .LBB3_10: ; %Flow19
+; GFX908-NEXT:  .LBB3_10: ; %Flow22
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  ; %bb.11: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_add_u32 s10, s10, s8
-; GFX908-NEXT:    s_addc_u32 s11, s11, 0
-; GFX908-NEXT:    s_add_u32 s14, s14, s16
-; GFX908-NEXT:    s_addc_u32 s15, s15, s17
+; GFX908-NEXT:    s_add_u32 s16, s16, s6
+; GFX908-NEXT:    s_addc_u32 s17, s17, 0
+; GFX908-NEXT:    s_add_u32 s10, s10, s14
+; GFX908-NEXT:    s_addc_u32 s11, s11, s15
 ; GFX908-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
 ; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
@@ -673,51 +671,49 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
 ; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX90A-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX90A-NEXT:    s_mov_b32 s12, 0
-; GFX90A-NEXT:    s_mov_b32 s9, s12
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s7
+; GFX90A-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_mov_b32 s7, s4
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX90A-NEXT:    s_sub_i32 s0, 0, s15
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; GFX90A-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX90A-NEXT:    s_mul_i32 s1, s1, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX90A-NEXT:    s_add_i32 s2, s2, s1
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s1, s7
-; GFX90A-NEXT:    s_sub_i32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_sub_i32 s6, s2, s7
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s18
+; GFX90A-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX90A-NEXT:    s_add_i32 s1, s1, s0
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s14, s1
+; GFX90A-NEXT:    s_mul_i32 s1, s0, s15
+; GFX90A-NEXT:    s_sub_i32 s1, s14, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_sub_i32 s3, s1, s15
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s15
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX90A-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX90A-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s15
+; GFX90A-NEXT:    s_cselect_b32 s6, s2, s0
+; GFX90A-NEXT:    s_lshr_b32 s2, s18, 16
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, s2
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX90A-NEXT:    s_lshl_b64 s[8:9], s[12:13], 5
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[6:7], 5
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s2, v18
 ; GFX90A-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX90A-NEXT:    s_mul_i32 s3, s5, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s4, s2
+; GFX90A-NEXT:    s_mul_i32 s3, s13, s2
+; GFX90A-NEXT:    s_mul_hi_u32 s5, s12, s2
+; GFX90A-NEXT:    s_mul_i32 s2, s12, s2
 ; GFX90A-NEXT:    s_add_i32 s3, s5, s3
-; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[2:3], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
-; GFX90A-NEXT:  .LBB3_1: ; %Flow20
+; GFX90A-NEXT:  .LBB3_1: ; %Flow23
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
@@ -730,36 +726,36 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX90A-NEXT:    s_cmp_lt_i32 s11, 0
-; GFX90A-NEXT:    s_mov_b32 s13, s12
+; GFX90A-NEXT:    s_cmp_lt_i32 s17, 0
+; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GFX90A-NEXT:    s_cmp_gt_i32 s11, -1
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    s_cmp_gt_i32 s17, -1
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[2:3]
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT:    s_mov_b64 s[20:21], s[10:11]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v12
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v4
-; GFX90A-NEXT:    v_readfirstlane_b32 s13, v5
-; GFX90A-NEXT:    s_add_u32 s9, s9, 1
-; GFX90A-NEXT:    s_addc_u32 s13, s13, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX90A-NEXT:    s_mul_i32 s13, s6, s13
-; GFX90A-NEXT:    s_mul_i32 s23, s7, s9
-; GFX90A-NEXT:    s_add_i32 s13, s22, s13
-; GFX90A-NEXT:    s_mul_i32 s9, s6, s9
-; GFX90A-NEXT:    s_add_i32 s13, s13, s23
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX90A-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX90A-NEXT:    s_add_u32 s5, s5, 1
+; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s22, s8, s5
+; GFX90A-NEXT:    s_mul_i32 s7, s8, s7
+; GFX90A-NEXT:    s_mul_i32 s23, s9, s5
+; GFX90A-NEXT:    s_add_i32 s7, s22, s7
+; GFX90A-NEXT:    s_mul_i32 s5, s8, s5
+; GFX90A-NEXT:    s_add_i32 s7, s7, s23
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s20, s20, s4
-; GFX90A-NEXT:    s_addc_u32 s21, s21, s5
+; GFX90A-NEXT:    s_add_u32 s20, s20, s12
+; GFX90A-NEXT:    s_addc_u32 s21, s21, s13
 ; GFX90A-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX90A-NEXT:    v_cmp_lt_i32_e64 s[24:25], -1, v5
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
@@ -767,8 +763,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    s_add_u32 s22, s20, s9
-; GFX90A-NEXT:    s_addc_u32 s23, s21, s13
+; GFX90A-NEXT:    s_add_u32 s22, s20, s5
+; GFX90A-NEXT:    s_addc_u32 s23, s21, s7
 ; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:16 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:20 glc
@@ -809,17 +805,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX90A-NEXT:  .LBB3_10: ; %Flow19
+; GFX90A-NEXT:  .LBB3_10: ; %Flow22
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  ; %bb.11: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_add_u32 s10, s10, s8
-; GFX90A-NEXT:    s_addc_u32 s11, s11, 0
-; GFX90A-NEXT:    s_add_u32 s14, s14, s16
-; GFX90A-NEXT:    s_addc_u32 s15, s15, s17
+; GFX90A-NEXT:    s_add_u32 s16, s16, s6
+; GFX90A-NEXT:    s_addc_u32 s17, s17, 0
+; GFX90A-NEXT:    s_add_u32 s10, s10, s14
+; GFX90A-NEXT:    s_addc_u32 s11, s11, s15
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX90A-NEXT:    s_branch .LBB3_1
 ; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
index 8d2e1c8b89ffd..5f194277337c0 100644
--- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -4,23 +4,20 @@
 define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, i1 %cond.i.i.i2295, ptr addrspace(1) %ptr, ptr %ptr1) #0 {
 ; CHECK-LABEL: vgpr_mfma_pass_av_split_crash:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x0
-; CHECK-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
+; CHECK-NEXT:    s_load_dwordx8 s[12:19], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v30, 0x9037ab78
 ; CHECK-NEXT:    v_mov_b32_e32 v31, 0x3e21eeb6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; CHECK-NEXT:    s_xor_b64 s[18:19], s[16:17], -1
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 8
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; CHECK-NEXT:    s_xor_b64 s[20:21], s[2:3], -1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xa17f65f6
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xbe927e4f
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_bitcmp1_b32 s14, 0
+; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; CHECK-NEXT:    s_xor_b64 s[20:21], s[10:11], -1
+; CHECK-NEXT:    s_bitcmp1_b32 s14, 8
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    s_xor_b64 s[14:15], s[0:1], -1
+; CHECK-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0x19f4ec90
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0x3efa01a0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0x16c16967
@@ -29,30 +26,31 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_mov_b32_e32 v9, 0x3f4b2bb0
 ; CHECK-NEXT:    v_mov_b32_e32 v10, 0xa57d9582
 ; CHECK-NEXT:    v_mov_b32_e32 v11, 0xbf8c6ea4
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v12, 0xe82d3ff0
 ; CHECK-NEXT:    v_mov_b32_e32 v13, 0xbfa59976
 ; CHECK-NEXT:    v_mov_b32_e32 v14, 0x8427b883
 ; CHECK-NEXT:    v_mov_b32_e32 v15, 0x3fae1bb4
-; CHECK-NEXT:    s_mov_b64 s[22:23], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v16, 0x57b87036
 ; CHECK-NEXT:    v_mov_b32_e32 v17, 0x3fb3b136
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[16:17]
+; CHECK-NEXT:    s_mov_b64 s[22:23], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v18, 0x55555523
 ; CHECK-NEXT:    v_mov_b32_e32 v19, 0xbfd55555
-; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[18:19]
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[10:11]
+; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[20:21]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
 ; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr22_vgpr23
 ; CHECK-NEXT:    s_branch .LBB0_2
-; CHECK-NEXT:  .LBB0_1: ; %Flow9
+; CHECK-NEXT:  .LBB0_1: ; %Flow10
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_17
 ; CHECK-NEXT:  .LBB0_2: ; %._crit_edge1942.i.i.i3548
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB0_6 Depth 2
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_9
 ; CHECK-NEXT:  ; %bb.3: ; %.preheader1868.i.i.i3244
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
@@ -60,7 +58,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %.preheader1855.i.i.i3329.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
 ; CHECK-NEXT:    flat_load_dwordx2 v[24:25], v[24:25]
 ; CHECK-NEXT:    v_mov_b64_e32 v[26:27], v[30:31]
 ; CHECK-NEXT:    v_mov_b64_e32 v[28:29], v[2:3]
@@ -97,14 +95,14 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_accvgpr_read_b32 v26, a2
 ; CHECK-NEXT:    s_mov_b64 s[24:25], -1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
-; CHECK-NEXT:    s_mov_b64 vcc, s[2:3]
+; CHECK-NEXT:    s_mov_b64 vcc, s[0:1]
 ; CHECK-NEXT:    ; implicit-def: $agpr2_agpr3
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
 ; CHECK-NEXT:  ; %bb.7: ; %.lr.ph2070.i.i.i3291
 ; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
 ; CHECK-NEXT:    v_accvgpr_mov_b32 a3, a1
 ; CHECK-NEXT:    v_accvgpr_mov_b32 a2, a0
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 vcc, s[6:7]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
 ; CHECK-NEXT:  ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
@@ -115,11 +113,11 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    s_branch .LBB0_5
 ; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[10:11]
+; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[12:13]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v24
 ; CHECK-NEXT:    s_mov_b64 s[22:23], 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v25
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; CHECK-NEXT:    s_branch .LBB0_15
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
@@ -133,12 +131,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v24
-; CHECK-NEXT:    v_cndmask_b32_e64 v23, v23, 0, s[16:17]
+; CHECK-NEXT:    v_cndmask_b32_e64 v23, v23, 0, s[10:11]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v27, v26
-; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[16:17]
-; CHECK-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[16:17]
-; CHECK-NEXT:    global_store_dwordx2 v0, v[26:27], s[12:13]
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[10:11]
+; CHECK-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[10:11]
+; CHECK-NEXT:    global_store_dwordx2 v0, v[26:27], s[16:17]
 ; CHECK-NEXT:    s_cselect_b32 s23, s23, 0
 ; CHECK-NEXT:    s_cselect_b32 s22, s22, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
@@ -147,10 +145,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v24
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[22:23], 0
-; CHECK-NEXT:  .LBB0_14: ; %Flow6
+; CHECK-NEXT:  .LBB0_14: ; %Flow7
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v25
-; CHECK-NEXT:  .LBB0_15: ; %Flow6
+; CHECK-NEXT:  .LBB0_15: ; %Flow7
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], -1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
@@ -158,7 +156,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:  ; %bb.16: ; %._crit_edge2105.i.i.i2330
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
-; CHECK-NEXT:    global_store_dwordx2 v0, v[20:21], s[12:13]
+; CHECK-NEXT:    global_store_dwordx2 v0, v[20:21], s[16:17]
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_17: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index e0b83eeaa0faa..2c470ff2a4c7b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -13,80 +13,78 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg3.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg3.kernarg.offset.align.down + 16, align 8, addrspace 4)
-  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr20, 0, implicit-def $scc
+  ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX8_IMM_ec renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s256) from %ir.arg2.kernarg.offset, align 16, addrspace 4)
+  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr22, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr20, 8, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr30_sgpr31, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr22, 8, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 killed renamable $sgpr28_sgpr29, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr4 = DS_READ_B32_gfx9 renamable $vgpr5, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
-  ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x0000000000003FFF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr38_sgpr39, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.59, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25, $vgpr4, $vgpr5
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF implicit-def $vgpr16
   ; GFX90A-NEXT:   renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2
   ; GFX90A-NEXT:   renamable $vgpr27 = IMPLICIT_DEF implicit-def $vgpr26
   ; GFX90A-NEXT:   renamable $vgpr29 = IMPLICIT_DEF implicit-def $vgpr28
   ; GFX90A-NEXT:   renamable $vgpr33 = IMPLICIT_DEF implicit-def $vgpr32
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
-  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.58(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr26_sgpr27, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; GFX90A-NEXT:   renamable $vgpr6 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.57, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.58, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = nsw V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr28, $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr1, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr38_sgpr39, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.35, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.5:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -109,7 +107,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.6.Flow20:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr26 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr28 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
@@ -121,16 +119,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
-  ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.63(0x40000000), %bb.8(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
-  ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.62, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
+  ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr26_sgpr27, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.63, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -139,15 +137,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.9.bb89:
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
   ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -156,15 +154,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.11.bb84:
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
   ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -173,15 +171,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.13.bb79:
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
   ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -190,7 +188,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.15.bb72:
   ; GFX90A-NEXT:   successors: %bb.16(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc
@@ -200,122 +198,122 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   $sgpr13 = COPY killed renamable $sgpr15
   ; GFX90A-NEXT:   $sgpr14 = COPY killed renamable $sgpr16
   ; GFX90A-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.16.Flow36:
   ; GFX90A-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr66_sgpr67, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr68_sgpr69, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.18, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.17.bb67:
   ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.18.Flow37:
   ; GFX90A-NEXT:   successors: %bb.19(0x40000000), %bb.20(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr64_sgpr65, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr66_sgpr67, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.20, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.19.bb62:
   ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.20.Flow38:
   ; GFX90A-NEXT:   successors: %bb.21(0x40000000), %bb.22(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr64_sgpr65, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.22, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.21.bb54:
   ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.22.Flow39:
   ; GFX90A-NEXT:   successors: %bb.23(0x40000000), %bb.24(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.24, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.23.bb47:
   ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.24.Flow40:
   ; GFX90A-NEXT:   successors: %bb.25(0x40000000), %bb.26(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.26, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.25.bb40:
   ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.26.Flow41:
   ; GFX90A-NEXT:   successors: %bb.27(0x40000000), %bb.28(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.28, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.27.bb33:
   ; GFX90A-NEXT:   successors: %bb.28(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.28.Flow42:
   ; GFX90A-NEXT:   successors: %bb.34(0x40000000), %bb.29(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.34, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.29.Flow43:
   ; GFX90A-NEXT:   successors: %bb.30(0x40000000), %bb.31(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -323,17 +321,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.30.bb19:
   ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.31.Flow44:
   ; GFX90A-NEXT:   successors: %bb.32(0x40000000), %bb.33(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr68_sgpr69, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr70_sgpr71, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr68_sgpr69, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr70_sgpr71, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.33, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock:
@@ -349,32 +347,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.34.bb26:
   ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr70_sgpr71, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_OR_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.29
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -393,24 +391,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr28_sgpr29 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.36.Flow21:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr66_sgpr67, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr68_sgpr69, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
@@ -432,42 +431,42 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.38.Flow22:
   ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 killed renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr26_sgpr27, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.36
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -484,39 +483,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $vgpr58, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44)
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr59, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
@@ -534,34 +533,35 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.46, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
   ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr20, 16, implicit-def $scc
+  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr22, 16, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
@@ -570,9 +570,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr62, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr4, $vgpr5, $vgpr6, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr58, $vgpr60, $vgpr63, $vgpr59
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -590,23 +590,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.45.Flow26:
   ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -614,10 +614,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -632,59 +632,70 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr46_sgpr47 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.47.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.42
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.49:
-  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+  ; GFX90A-NEXT:   successors: %bb.52(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   S_BRANCH %bb.44
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY renamable $sgpr26_sgpr27
+  ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   S_BRANCH %bb.52
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
-  ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+  ; GFX90A-NEXT:   successors: %bb.55(0x40000000), %bb.51(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.55, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.51:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+  ; GFX90A-NEXT:   successors: %bb.52(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.52.Flow26:
+  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr60, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr28_sgpr29_sgpr30, $sgpr30_sgpr31, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr5, $vgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr61, $vgpr63, $vgpr62, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59
+  ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
@@ -696,22 +707,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.52.bb80:
-  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.53.bb80:
+  ; GFX90A-NEXT:   successors: %bb.60(0x40000000), %bb.54(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc
+  ; GFX90A-NEXT:   dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr22, 65560, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_SCC0 %bb.59, implicit killed $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC0 %bb.60, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.53:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.54:
+  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr62_sgpr63, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
@@ -722,17 +733,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.61
+  ; GFX90A-NEXT:   S_BRANCH %bb.62
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.54.bb73:
-  ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
+  ; GFX90A-NEXT: bb.55.bb73:
+  ; GFX90A-NEXT:   successors: %bb.53(0x40000000), %bb.56(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003FF, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr60_sgpr61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
   ; GFX90A-NEXT:   renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
@@ -747,38 +757,38 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr62_sgpr63 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.52, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.53, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.55.Flow29:
+  ; GFX90A-NEXT: bb.56.Flow29:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr62_sgpr63, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.56.bb90:
-  ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.57.bb90:
+  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003CF, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr7 = COPY renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr3 = COPY renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr3 = COPY renamable $sgpr24, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from %ir.5, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_LSHR_B64 killed renamable $sgpr56_sgpr57, 1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_LSHR_B64 renamable $sgpr20_sgpr21, 1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = V_LSHRREV_B64_e64 1, $vgpr24_vgpr25, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec
-  ; GFX90A-NEXT:   S_BRANCH %bb.60
+  ; GFX90A-NEXT:   S_BRANCH %bb.61
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.57:
+  ; GFX90A-NEXT: bb.58:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
@@ -790,13 +800,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -813,33 +823,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.7
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.58.bb105:
+  ; GFX90A-NEXT: bb.59.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x0000000000003FFF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr26, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.421, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr24, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr32_vgpr33 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.5, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.59.bb85:
-  ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.60.bb85:
+  ; GFX90A-NEXT:   successors: %bb.57(0x40000000), %bb.61(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003CF, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = COPY renamable $sgpr26_sgpr27
   ; GFX90A-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
@@ -849,69 +859,69 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
   ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
+  ; GFX90A-NEXT:   $sgpr56_sgpr57 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.57, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.60.Flow31:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.61.Flow31:
+  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr56_sgpr57, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.61.Flow30:
-  ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.62.Flow30:
+  ; GFX90A-NEXT:   successors: %bb.56(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr20_sgpr21, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.55
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr20_sgpr21, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_ANDN2_B64 renamable $sgpr26_sgpr27, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr20_sgpr21, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.56
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.62.bb140:
-  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.63.bb140:
+  ; GFX90A-NEXT:   successors: %bb.69(0x40000000), %bb.64(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.68, implicit $vcc
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.69, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.63.Flow13:
-  ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.64.Flow13:
+  ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.67(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
+  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr20_sgpr21, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.67, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.64.bb159:
-  ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.65.bb159:
+  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.66(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr6, implicit $exec
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.67, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.68, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.65.Flow10:
-  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.66.Flow10:
+  ; GFX90A-NEXT:   successors: %bb.67(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.66.Flow14:
+  ; GFX90A-NEXT: bb.67.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = COPY $exec
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.67.bb161:
-  ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.68.bb161:
+  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr27, killed $vgpr29, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr33, implicit $exec
@@ -926,11 +936,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr14, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
-  ; GFX90A-NEXT:   S_BRANCH %bb.65
+  ; GFX90A-NEXT:   S_BRANCH %bb.66
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.68.bb174:
-  ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.69.bb174:
+  ; GFX90A-NEXT:   successors: %bb.73(0x40000000), %bb.70(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $agpr0 = COPY killed renamable $vgpr14, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 1, $vgpr32, implicit $exec
@@ -942,23 +952,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr50, $vgpr20, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr14 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr38, killed $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.73, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.69.Flow:
-  ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.70.Flow:
+  ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.72(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.70.bb186:
-  ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.71.bb186:
+  ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = nsw V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr31, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr30, $vgpr4, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr35 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr55 = COPY renamable $vgpr35, implicit $exec
@@ -969,9 +979,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr15 = COPY renamable $vgpr35, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr35, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr35, renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.4, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr16 = COPY killed renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr16 = COPY killed renamable $sgpr24, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr16, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.5, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr35, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.4, addrspace 3)
@@ -981,24 +991,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.71.Flow9:
-  ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.72.Flow9:
+  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr14 = COPY killed renamable $agpr0, implicit $exec
-  ; GFX90A-NEXT:   S_BRANCH %bb.63
+  ; GFX90A-NEXT:   S_BRANCH %bb.64
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.72.bb196:
-  ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.73.bb196:
+  ; GFX90A-NEXT:   successors: %bb.70(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr28_sgpr29_sgpr30_sgpr31:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000003C0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr14, killed $vgpr24, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22 = V_OR_B32_e32 killed $vgpr2, killed $vgpr22, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr23, renamable $vgpr22_vgpr23, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.69
+  ; GFX90A-NEXT:   S_BRANCH %bb.70
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i11 = icmp eq i32 %i, 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index e3273165906e9..13f89832243c4 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -1151,35 +1151,37 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-LABEL: long_branch_hang:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c nv
-; GCN-NEXT:    s_mov_b32 s7, -1
+; GCN-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c nv
+; GCN-NEXT:    s_mov_b32 s9, -1
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s6, -1, 0
+; GCN-NEXT:    s_cselect_b32 s8, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s8, -1, 0
+; GCN-NEXT:    s_cselect_b32 s10, -1, 0
 ; GCN-NEXT:    s_cmp_lt_i32 s3, 6
 ; GCN-NEXT:    s_cbranch_scc0 .LBB10_1
 ; GCN-NEXT:  ; %bb.10: ; %bb
-; GCN-NEXT:    s_get_pc_i64 s[10:11]
+; GCN-NEXT:    s_get_pc_i64 s[12:13]
 ; GCN-NEXT:  .Lpost_getpc14:
-; GCN-NEXT:    s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc14)&4294967295
-; GCN-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc14)>>32
-; GCN-NEXT:    s_set_pc_i64 s[10:11]
+; GCN-NEXT:    s_add_co_u32 s12, s12, (.LBB10_4-.Lpost_getpc14)&4294967295
+; GCN-NEXT:    s_add_co_ci_u32 s13, s13, (.LBB10_4-.Lpost_getpc14)>>32
+; GCN-NEXT:    s_set_pc_i64 s[12:13]
 ; GCN-NEXT:  .LBB10_1: ; %Flow
-; GCN-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GCN-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s9
 ; GCN-NEXT:    s_cbranch_vccnz .LBB10_2
 ; GCN-NEXT:  ; %bb.12: ; %Flow
-; GCN-NEXT:    s_get_pc_i64 s[8:9]
+; GCN-NEXT:    s_get_pc_i64 s[10:11]
 ; GCN-NEXT:  .Lpost_getpc15:
-; GCN-NEXT:    s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc15)&4294967295
-; GCN-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc15)>>32
-; GCN-NEXT:    s_set_pc_i64 s[8:9]
-; GCN-NEXT:  .LBB10_2: ; %Flow5
+; GCN-NEXT:    s_add_co_u32 s10, s10, (.LBB10_5-.Lpost_getpc15)&4294967295
+; GCN-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_5-.Lpost_getpc15)>>32
+; GCN-NEXT:    s_set_pc_i64 s[10:11]
+; GCN-NEXT:  .LBB10_2: ; %Flow6
 ; GCN-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GCN-NEXT:    s_cbranch_vccz .LBB10_3
-; GCN-NEXT:  ; %bb.14: ; %Flow5
+; GCN-NEXT:  ; %bb.14: ; %Flow6
 ; GCN-NEXT:    s_get_pc_i64 s[0:1]
 ; GCN-NEXT:  .Lpost_getpc16:
 ; GCN-NEXT:    s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc16)&4294967295
@@ -1192,7 +1194,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cselect_b32 s1, -1, 0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    s_or_b32 s0, s1, s0
-; GCN-NEXT:    s_and_b32 s0, s6, s0
+; GCN-NEXT:    s_and_b32 s0, s8, s0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-NEXT:  ; %bb.8: ; %bb14
@@ -1207,70 +1209,69 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    v_nop_e64
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    s_sleep 0
-; GCN-NEXT:    s_mov_b32 s0, s8
+; GCN-NEXT:    s_mov_b32 s0, s10
 ; GCN-NEXT:    s_sleep 0
 ; GCN-NEXT:    s_cbranch_execz .LBB10_5
 ; GCN-NEXT:  ; %bb.16: ; %bb13
-; GCN-NEXT:    s_get_pc_i64 s[8:9]
+; GCN-NEXT:    s_get_pc_i64 s[10:11]
 ; GCN-NEXT:  .Lpost_getpc17:
-; GCN-NEXT:    s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc17)&4294967295
-; GCN-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc17)>>32
-; GCN-NEXT:    s_set_pc_i64 s[8:9]
+; GCN-NEXT:    s_add_co_u32 s10, s10, (.LBB10_2-.Lpost_getpc17)&4294967295
+; GCN-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_2-.Lpost_getpc17)>>32
+; GCN-NEXT:    s_set_pc_i64 s[10:11]
 ; GCN-NEXT:  .LBB10_5: ; %bb9
 ; GCN-NEXT:    s_cmp_lt_i32 s3, 11
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    s_cmp_ge_i32 s2, s3
-; GCN-NEXT:    s_cselect_b32 s7, -1, 0
+; GCN-NEXT:    s_cselect_b32 s9, -1, 0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s7, s0
+; GCN-NEXT:    s_and_b32 s0, s9, s0
 ; GCN-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GCN-NEXT:    s_cbranch_vccnz .LBB10_6
 ; GCN-NEXT:  ; %bb.18: ; %bb9
-; GCN-NEXT:    s_get_pc_i64 s[8:9]
+; GCN-NEXT:    s_get_pc_i64 s[10:11]
 ; GCN-NEXT:  .Lpost_getpc18:
-; GCN-NEXT:    s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc18)&4294967295
-; GCN-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc18)>>32
-; GCN-NEXT:    s_set_pc_i64 s[8:9]
+; GCN-NEXT:    s_add_co_u32 s10, s10, (.LBB10_3-.Lpost_getpc18)&4294967295
+; GCN-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_3-.Lpost_getpc18)>>32
+; GCN-NEXT:    s_set_pc_i64 s[10:11]
 ; GCN-NEXT:  .LBB10_6:
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c nv
-; GCN-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24 nv
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 nv
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[6:7], 2
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-ADD-PC64-LABEL: long_branch_hang:
 ; GCN-ADD-PC64:       ; %bb.0: ; %bb
 ; GCN-ADD-PC64-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-ADD-PC64-NEXT:    s_clause 0x1
 ; GCN-ADD-PC64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c nv
-; GCN-ADD-PC64-NEXT:    s_mov_b32 s7, -1
+; GCN-ADD-PC64-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c nv
+; GCN-ADD-PC64-NEXT:    s_mov_b32 s9, -1
 ; GCN-ADD-PC64-NEXT:    s_wait_kmcnt 0x0
 ; GCN-ADD-PC64-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-ADD-PC64-NEXT:    s_cselect_b32 s6, -1, 0
+; GCN-ADD-PC64-NEXT:    s_cselect_b32 s8, -1, 0
 ; GCN-ADD-PC64-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-ADD-PC64-NEXT:    s_mov_b32 s0, 0
-; GCN-ADD-PC64-NEXT:    s_cselect_b32 s8, -1, 0
+; GCN-ADD-PC64-NEXT:    s_cselect_b32 s10, -1, 0
 ; GCN-ADD-PC64-NEXT:    s_cmp_lt_i32 s3, 6
 ; GCN-ADD-PC64-NEXT:    s_cbranch_scc0 .LBB10_1
 ; GCN-ADD-PC64-NEXT:  ; %bb.10: ; %bb
 ; GCN-ADD-PC64-NEXT:    s_add_pc_i64 .LBB10_4-.Lpost_addpc14
 ; GCN-ADD-PC64-NEXT:  .Lpost_addpc14:
 ; GCN-ADD-PC64-NEXT:  .LBB10_1: ; %Flow
-; GCN-ADD-PC64-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GCN-ADD-PC64-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s9
 ; GCN-ADD-PC64-NEXT:    s_cbranch_vccnz .LBB10_2
 ; GCN-ADD-PC64-NEXT:  ; %bb.12: ; %Flow
 ; GCN-ADD-PC64-NEXT:    s_add_pc_i64 .LBB10_5-.Lpost_addpc15
 ; GCN-ADD-PC64-NEXT:  .Lpost_addpc15:
-; GCN-ADD-PC64-NEXT:  .LBB10_2: ; %Flow5
+; GCN-ADD-PC64-NEXT:  .LBB10_2: ; %Flow6
 ; GCN-ADD-PC64-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GCN-ADD-PC64-NEXT:    s_cbranch_vccz .LBB10_3
-; GCN-ADD-PC64-NEXT:  ; %bb.14: ; %Flow5
+; GCN-ADD-PC64-NEXT:  ; %bb.14: ; %Flow6
 ; GCN-ADD-PC64-NEXT:    s_add_pc_i64 .LBB10_6-.Lpost_addpc16
 ; GCN-ADD-PC64-NEXT:  .Lpost_addpc16:
 ; GCN-ADD-PC64-NEXT:  .LBB10_3: ; %bb14
@@ -1280,7 +1281,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-ADD-PC64-NEXT:    s_cselect_b32 s1, -1, 0
 ; GCN-ADD-PC64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-ADD-PC64-NEXT:    s_or_b32 s0, s1, s0
-; GCN-ADD-PC64-NEXT:    s_and_b32 s0, s6, s0
+; GCN-ADD-PC64-NEXT:    s_and_b32 s0, s8, s0
 ; GCN-ADD-PC64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-ADD-PC64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GCN-ADD-PC64-NEXT:  ; %bb.8: ; %bb14
@@ -1292,7 +1293,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-ADD-PC64-NEXT:    v_nop_e64
 ; GCN-ADD-PC64-NEXT:    ;;#ASMEND
 ; GCN-ADD-PC64-NEXT:    s_sleep 0
-; GCN-ADD-PC64-NEXT:    s_mov_b32 s0, s8
+; GCN-ADD-PC64-NEXT:    s_mov_b32 s0, s10
 ; GCN-ADD-PC64-NEXT:    s_sleep 0
 ; GCN-ADD-PC64-NEXT:    s_cbranch_execz .LBB10_5
 ; GCN-ADD-PC64-NEXT:  ; %bb.16: ; %bb13
@@ -1302,9 +1303,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-ADD-PC64-NEXT:    s_cmp_lt_i32 s3, 11
 ; GCN-ADD-PC64-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-ADD-PC64-NEXT:    s_cmp_ge_i32 s2, s3
-; GCN-ADD-PC64-NEXT:    s_cselect_b32 s7, -1, 0
+; GCN-ADD-PC64-NEXT:    s_cselect_b32 s9, -1, 0
 ; GCN-ADD-PC64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-ADD-PC64-NEXT:    s_and_b32 s0, s7, s0
+; GCN-ADD-PC64-NEXT:    s_and_b32 s0, s9, s0
 ; GCN-ADD-PC64-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GCN-ADD-PC64-NEXT:    s_cbranch_vccnz .LBB10_6
 ; GCN-ADD-PC64-NEXT:  ; %bb.18: ; %bb9
@@ -1313,14 +1314,11 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-ADD-PC64-NEXT:  .LBB10_6:
 ; GCN-ADD-PC64-NEXT:    ; implicit-def: $vgpr0
 ; GCN-ADD-PC64-NEXT:  .LBB10_7: ; %bb19
-; GCN-ADD-PC64-NEXT:    s_clause 0x1
-; GCN-ADD-PC64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c nv
-; GCN-ADD-PC64-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24 nv
+; GCN-ADD-PC64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 nv
 ; GCN-ADD-PC64-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-ADD-PC64-NEXT:    s_lshl_b64 s[2:3], s[6:7], 2
 ; GCN-ADD-PC64-NEXT:    s_wait_kmcnt 0x0
-; GCN-ADD-PC64-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN-ADD-PC64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-ADD-PC64-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GCN-ADD-PC64-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GCN-ADD-PC64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GCN-ADD-PC64-NEXT:    s_endpgm
 ; GCN-ENABLE-ADD-PC64-LABEL: long_branch_hang:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 155e54baf15a1..45a0ba9a7519a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1265,19 +1265,20 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-LABEL: long_branch_hang:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-NEXT:    s_cmp_lt_i32 s3, 6
 ; GCN-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GCN-NEXT:  ; %bb.8: ; %bb
-; GCN-NEXT:    s_getpc_b64 s[8:9]
+; GCN-NEXT:    s_getpc_b64 s[10:11]
 ; GCN-NEXT:  .Lpost_getpc12:
-; GCN-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295
-; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32
-; GCN-NEXT:    s_setpc_b64 s[8:9]
+; GCN-NEXT:    s_add_u32 s10, s10, (.LBB10_2-.Lpost_getpc12)&4294967295
+; GCN-NEXT:    s_addc_u32 s11, s11, (.LBB10_2-.Lpost_getpc12)>>32
+; GCN-NEXT:    s_setpc_b64 s[10:11]
 ; GCN-NEXT:  .LBB10_1: ; %bb13
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    v_nop_e64
@@ -1288,17 +1289,17 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cbranch_execz .LBB10_3
 ; GCN-NEXT:    s_branch .LBB10_4
 ; GCN-NEXT:  .LBB10_2:
-; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-NEXT:  .LBB10_3: ; %bb9
 ; GCN-NEXT:    s_cmp_lt_i32 s3, 11
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_cmp_ge_i32 s2, s3
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GCN-NEXT:  .LBB10_4: ; %Flow5
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GCN-NEXT:    s_cmp_ge_i32 s2, s3
+; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GCN-NEXT:  .LBB10_4: ; %Flow6
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
 ; GCN-NEXT:    s_cbranch_vccz .LBB10_5
-; GCN-NEXT:  ; %bb.10: ; %Flow5
+; GCN-NEXT:  ; %bb.10: ; %Flow6
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
 ; GCN-NEXT:  .Lpost_getpc13:
 ; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295
@@ -1310,41 +1311,42 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cmp_lt_i32 s2, s3
 ; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GCN-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GCN-NEXT:    s_branch .LBB10_7
 ; GCN-NEXT:  .LBB10_6:
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: long_branch_hang:
 ; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX11-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s3, 6
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX11-NEXT:  ; %bb.8: ; %bb
-; GFX11-NEXT:    s_getpc_b64 s[8:9]
+; GFX11-NEXT:    s_getpc_b64 s[10:11]
 ; GFX11-NEXT:  .Lpost_getpc11:
 ; GFX11-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295
-; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32
+; GFX11-NEXT:    s_add_u32 s10, s10, (.LBB10_2-.Lpost_getpc11)&4294967295
+; GFX11-NEXT:    s_addc_u32 s11, s11, (.LBB10_2-.Lpost_getpc11)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX11-NEXT:    s_setpc_b64 s[8:9]
+; GFX11-NEXT:    s_setpc_b64 s[10:11]
 ; GFX11-NEXT:  .LBB10_1: ; %bb13
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    v_nop_e64
@@ -1355,17 +1357,17 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX11-NEXT:    s_branch .LBB10_4
 ; GFX11-NEXT:  .LBB10_2:
-; GFX11-NEXT:    s_mov_b64 s[8:9], 0
+; GFX11-NEXT:    s_mov_b64 s[10:11], 0
 ; GFX11-NEXT:  .LBB10_3: ; %bb9
 ; GFX11-NEXT:    s_cmp_lt_i32 s3, 11
-; GFX11-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX11-NEXT:    s_cmp_ge_i32 s2, s3
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX11-NEXT:    s_cmp_ge_i32 s2, s3
+; GFX11-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:  .LBB10_4: ; %Flow5
+; GFX11-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX11-NEXT:  .LBB10_4: ; %Flow6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[8:9]
+; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[10:11]
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB10_6
 ; GFX11-NEXT:  ; %bb.5: ; %bb14
 ; GFX11-NEXT:    s_cmp_lt_i32 s1, 9
@@ -1374,60 +1376,59 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX11-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX11-NEXT:    s_branch .LBB10_7
 ; GFX11-NEXT:  .LBB10_6:
 ; GFX11-NEXT:    ; implicit-def: $vgpr0
 ; GFX11-NEXT:  .LBB10_7: ; %bb19
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[6:7], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: long_branch_hang:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT:    s_mov_b32 s7, -1
+; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX12-NEXT:    s_mov_b32 s9, -1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX12-NEXT:    s_cmp_lt_i32 s3, 6
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB10_1
 ; GFX12-NEXT:  ; %bb.18: ; %bb
-; GFX12-NEXT:    s_getpc_b64 s[10:11]
+; GFX12-NEXT:    s_getpc_b64 s[12:13]
 ; GFX12-NEXT:  .Lpost_getpc17:
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc17)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc17)>>32
+; GFX12-NEXT:    s_add_co_u32 s12, s12, (.LBB10_4-.Lpost_getpc17)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s13, s13, (.LBB10_4-.Lpost_getpc17)>>32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_setpc_b64 s[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[12:13]
 ; GFX12-NEXT:  .LBB10_1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s9
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB10_2
 ; GFX12-NEXT:  ; %bb.10: ; %Flow
-; GFX12-NEXT:    s_getpc_b64 s[8:9]
+; GFX12-NEXT:    s_getpc_b64 s[10:11]
 ; GFX12-NEXT:  .Lpost_getpc13:
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc13)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32
+; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_5-.Lpost_getpc13)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_5-.Lpost_getpc13)>>32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_2: ; %Flow5
+; GFX12-NEXT:    s_setpc_b64 s[10:11]
+; GFX12-NEXT:  .LBB10_2: ; %Flow6
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX12-NEXT:    s_cbranch_vccz .LBB10_3
-; GFX12-NEXT:  ; %bb.12: ; %Flow5
+; GFX12-NEXT:  ; %bb.12: ; %Flow6
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
 ; GFX12-NEXT:  .Lpost_getpc14:
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1442,7 +1443,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_or_b32 s0, s1, s0
-; GFX12-NEXT:    s_and_b32 s0, s6, s0
+; GFX12-NEXT:    s_and_b32 s0, s8, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX12-NEXT:  ; %bb.8: ; %bb14
@@ -1454,7 +1455,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_setpc_b64 s[0:1]
 ; GFX12-NEXT:  .LBB10_4: ; %bb13
-; GFX12-NEXT:    s_mov_b32 s0, s8
+; GFX12-NEXT:    s_mov_b32 s0, s10
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    v_nop_e64
 ; GFX12-NEXT:    v_nop_e64
@@ -1463,41 +1464,38 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_cbranch_execz .LBB10_5
 ; GFX12-NEXT:  ; %bb.14: ; %bb13
-; GFX12-NEXT:    s_getpc_b64 s[8:9]
+; GFX12-NEXT:    s_getpc_b64 s[10:11]
 ; GFX12-NEXT:  .Lpost_getpc15:
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc15)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc15)>>32
+; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_2-.Lpost_getpc15)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_2-.Lpost_getpc15)>>32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_setpc_b64 s[8:9]
+; GFX12-NEXT:    s_setpc_b64 s[10:11]
 ; GFX12-NEXT:  .LBB10_5: ; %bb9
 ; GFX12-NEXT:    s_cmp_lt_i32 s3, 11
 ; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT:    s_cmp_ge_i32 s2, s3
-; GFX12-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, s7, s0
+; GFX12-NEXT:    s_and_b32 s0, s9, s0
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX12-NEXT:    s_cbranch_vccnz .LBB10_6
 ; GFX12-NEXT:  ; %bb.16: ; %bb9
-; GFX12-NEXT:    s_getpc_b64 s[8:9]
+; GFX12-NEXT:    s_getpc_b64 s[10:11]
 ; GFX12-NEXT:  .Lpost_getpc16:
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc16)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc16)>>32
+; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_3-.Lpost_getpc16)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_3-.Lpost_getpc16)>>32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_setpc_b64 s[8:9]
+; GFX12-NEXT:    s_setpc_b64 s[10:11]
 ; GFX12-NEXT:  .LBB10_6:
 ; GFX12-NEXT:    ; implicit-def: $vgpr0
 ; GFX12-NEXT:  .LBB10_7: ; %bb19
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[6:7], 2
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 37f5889918c41..5a05a16fcb148 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -334,58 +334,54 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
 define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
 ; SI-LABEL: ctpop_i64_in_br:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
+; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB7_4
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x2
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x2
 ; SI-NEXT:    s_mov_b64 s[2:3], 0
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccnz .LBB7_3
 ; SI-NEXT:  .LBB7_2: ; %if
-; SI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
-; SI-NEXT:    s_mov_b32 s5, 0
+; SI-NEXT:    s_bcnt1_i32_b64 s6, s[4:5]
+; SI-NEXT:    s_mov_b32 s7, 0
 ; SI-NEXT:  .LBB7_3: ; %endif
-; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB7_4:
-; SI-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; SI-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; SI-NEXT:    s_branch .LBB7_2
 ;
 ; VI-LABEL: ctpop_i64_in_br:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s8, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
 ; VI-NEXT:    s_cbranch_scc0 .LBB7_4
 ; VI-NEXT:  ; %bb.1: ; %else
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x8
 ; VI-NEXT:    s_cbranch_execnz .LBB7_3
 ; VI-NEXT:  .LBB7_2: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
-; VI-NEXT:    s_mov_b32 s5, 0
+; VI-NEXT:    s_bcnt1_i32_b64 s6, s[4:5]
+; VI-NEXT:    s_mov_b32 s7, 0
 ; VI-NEXT:  .LBB7_3: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:  .LBB7_4:
-; VI-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; VI-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; VI-NEXT:    s_branch .LBB7_2
 entry:
   %tmp0 = icmp eq i32 %cond, 0
@@ -409,28 +405,32 @@ endif:
 define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
 ; SI-LABEL: s_ctpop_i128:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; SI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; SI-NEXT:    s_add_i32 s0, s0, s2
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
+; SI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; SI-NEXT:    s_add_i32 s0, s1, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_ctpop_i128:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; VI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; VI-NEXT:    s_add_i32 s0, s0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
+; VI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; VI-NEXT:    s_add_i32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 1bbe503bfe072..6f0e986e969cf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -114,27 +114,25 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
   ; GCN-NEXT:   liveins: $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s128) from %ir.x.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY6]], implicit-def dead $scc
+  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX4_IMM]]
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub2
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY5]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
-  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY5]], killed [[S_MOV_B32_2]], implicit-def $scc
-  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY8]], killed [[COPY7]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index a36be284a4bc3..beb1975e864fd 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -256,30 +256,26 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 {
 ; SI-LABEL: dynamic_extract_vector_elt_v3f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s6, 4
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; SI-NEXT:    s_lshl_b32 s2, s2, 4
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_extract_vector_elt_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s8, 4
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; VI-NEXT:    s_lshl_b32 s2, s2, 4
+; VI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -287,31 +283,31 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s6, 4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s6, 4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 9201f60a32535..591a2f829373b 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -18,8 +18,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(ptr addrspace(1) %out, ptr
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(ptr addrspace(1) %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
@@ -33,11 +33,9 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(ptr addrspace(1) %out, <
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(ptr addrspace(1) %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 0d8a9f6aca34b..e135b1134932e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -186,31 +186,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %ou
 ;
 ; GCN-LABEL: dynamic_extract_vector_elt_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x15
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x13
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 4
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s2
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
-;
-; GFX89-LABEL: dynamic_extract_vector_elt_v3i16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dword s8, s[4:5], 0x54
-; GFX89-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_lshl_b32 s4, s8, 4
-; GFX89-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
-; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
   store i16 %p0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index eb0ed5e6ee4b9..6e2252eabbf9a 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -32,8 +32,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(ptr addrspace(1) %out, <2 x
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
 ; GCN-NOT: buffer_load
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(ptr addrspace(1) %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
@@ -63,8 +63,8 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(ptr addrspace(1) %out,
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(ptr addrspace(1) %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
@@ -78,11 +78,9 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(ptr addrspace(1) %out, <
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(ptr addrspace(1) %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index b35f07002a48a..5ac65e56e1a6f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -5739,19 +5739,19 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
-; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_load_dword s3, s[4:5], 0xb
+; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s4, s0
+; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
-; GCN1-NEXT:    v_mov_b32_e32 v1, s4
+; GCN1-NEXT:    v_mov_b32_e32 v0, s3
+; GCN1-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5760,19 +5760,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
-; GCN2-NEXT:    v_mov_b32_e32 v1, s4
+; GCN2-NEXT:    v_mov_b32_e32 v0, s3
+; GCN2-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5781,17 +5781,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_add_u32 s0, s6, s0
+; GCN3-NEXT:    s_addc_u32 s1, s7, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s3
+; GCN3-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5800,17 +5800,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s6, s0
+; GFX11-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5828,93 +5828,93 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
-; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s8
-; GCN1-NEXT:    s_add_u32 s0, s0, s4
-; GCN1-NEXT:    s_addc_u32 s1, s1, s5
+; GCN1-NEXT:    s_load_dword s3, s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s4, s0
+; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
-; GCN1-NEXT:    v_mov_b32_e32 v1, s9
+; GCN1-NEXT:    v_mov_b32_e32 v0, s3
+; GCN1-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_mov_b32_e32 v0, s2
-; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
+; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    flat_store_dword v[0:1], v2
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s8
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
-; GCN2-NEXT:    v_mov_b32_e32 v1, s9
+; GCN2-NEXT:    v_mov_b32_e32 v0, s3
+; GCN2-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_mov_b32_e32 v0, s2
-; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
+; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    flat_store_dword v[0:1], v2
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s8
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_add_u32 s0, s8, s0
+; GCN3-NEXT:    s_addc_u32 s1, s9, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
-; GCN3-NEXT:    v_mov_b32_e32 v1, s9
+; GCN3-NEXT:    v_mov_b32_e32 v0, s3
+; GCN3-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s10
+; GCN3-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x34
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s8, s0
+; GFX11-NEXT:    s_addc_u32 s1, s9, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s0, s4
-; GFX11-NEXT:    s_addc_u32 s1, s1, s5
-; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] offset:16 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s11
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
 entry:
@@ -6060,17 +6060,17 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
-; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_load_dword s3, s[4:5], 0xb
+; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s4, s0
+; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
-; GCN1-NEXT:    v_mov_b32_e32 v1, s4
+; GCN1-NEXT:    v_mov_b32_e32 v0, s3
+; GCN1-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6079,17 +6079,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
-; GCN2-NEXT:    v_mov_b32_e32 v1, s4
+; GCN2-NEXT:    v_mov_b32_e32 v0, s3
+; GCN2-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6098,17 +6098,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_add_u32 s0, s6, s0
+; GCN3-NEXT:    s_addc_u32 s1, s7, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s3
+; GCN3-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6117,17 +6117,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s6, s0
+; GFX11-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6144,89 +6144,89 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
-; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s8
-; GCN1-NEXT:    s_add_u32 s0, s0, s4
-; GCN1-NEXT:    s_addc_u32 s1, s1, s5
+; GCN1-NEXT:    s_load_dword s3, s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s4, s0
+; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
-; GCN1-NEXT:    v_mov_b32_e32 v1, s9
+; GCN1-NEXT:    v_mov_b32_e32 v0, s3
+; GCN1-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_mov_b32_e32 v0, s2
-; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
+; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    flat_store_dword v[0:1], v2
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s8
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
-; GCN2-NEXT:    v_mov_b32_e32 v1, s9
+; GCN2-NEXT:    v_mov_b32_e32 v0, s3
+; GCN2-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_mov_b32_e32 v0, s2
-; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
+; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    flat_store_dword v[0:1], v2
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s8
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    s_add_u32 s0, s8, s0
+; GCN3-NEXT:    s_addc_u32 s1, s9, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
-; GCN3-NEXT:    v_mov_b32_e32 v1, s9
+; GCN3-NEXT:    v_mov_b32_e32 v0, s3
+; GCN3-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s10
+; GCN3-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x34
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s8, s0
+; GFX11-NEXT:    s_addc_u32 s1, s9, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s0, s4
-; GFX11-NEXT:    s_addc_u32 s1, s1, s5
-; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s11
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index e1fef9083e132..7106049573483 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4245,41 +4245,39 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
 define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
 ; SI-LABEL: s_fneg_select_infloop_regression_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; SI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bitcmp1_b32 s6, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_bitcmp1_b32 s2, 0
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; SI-NEXT:    s_and_b64 s[6:7], s[2:3], exec
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s6, 0
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    s_bitcmp1_b32 s2, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], exec
+; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
 ; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, double 0.0, double %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 2fe2978f81bbd..55503825043fe 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -20,24 +20,22 @@ declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32,
 define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -47,7 +45,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -87,32 +85,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -123,7 +119,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -142,24 +138,22 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -169,7 +163,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -209,32 +203,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -245,7 +237,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -264,24 +256,22 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -291,7 +281,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -331,32 +321,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -366,7 +354,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -385,24 +373,22 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -412,7 +398,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -452,32 +438,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -487,7 +471,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -506,24 +490,22 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -533,7 +515,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -573,32 +555,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -609,7 +589,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -628,24 +608,22 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -655,7 +633,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -695,32 +673,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -731,7 +707,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -750,24 +726,22 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -777,7 +751,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -817,32 +791,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -852,7 +824,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -871,24 +843,22 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -898,7 +868,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -938,32 +908,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -973,7 +941,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -992,24 +960,22 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -1019,7 +985,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -1059,32 +1025,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -1095,7 +1059,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1114,24 +1078,22 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 offen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1141,7 +1103,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
 ; GFX1250-NEXT:    s_endpgm
@@ -1181,32 +1143,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -1217,7 +1177,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_mov_b32 s6, 4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1236,24 +1196,22 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1263,7 +1221,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -1303,32 +1261,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -1338,7 +1294,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1357,24 +1313,22 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
@@ -1384,7 +1338,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
 ; GFX1250-NEXT:    s_endpgm
@@ -1424,32 +1378,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s14
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s14
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -1459,7 +1411,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34 nv
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index e987190e335d3..8d3cc0880093a 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -16,51 +16,41 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i
 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
 ; SI-LABEL: raw_buffer_atomic_min_noret_f64:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT:    s_load_dword s8, s[4:5], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -253,51 +243,41 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
 ; SI-LABEL: raw_buffer_atomic_max_noret_f64:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT:    s_load_dword s8, s[4:5], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -424,14 +404,15 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
 ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 2a7027c3393b2..796be04bf94a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -16,51 +16,41 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
 ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT:    s_load_dword s8, s[4:5], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -253,51 +243,41 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
 ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT:    s_load_dword s8, s[4:5], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[4:5], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -424,14 +404,15 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
 ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
index 872c2cf569dcc..f4b2f8e15bbb2 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -25,7 +25,7 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow6
+; CHECK-NEXT:  ; %bb.1: ; %Flow4
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
 ; CHECK-NEXT:  .LBB0_2: ; %atomicrmw.phi
@@ -57,7 +57,7 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
 ; CHECK-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
-; CHECK-NEXT:  .LBB0_7: ; %Flow5
+; CHECK-NEXT:  .LBB0_7: ; %Flow3
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_8: ; %atomicrmw.shared
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index da132d0269e6b..226a26b26e2b8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -5437,38 +5437,38 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v3, s5
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16
+; SI-NEXT:    s_load_dword s3, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s4, s0
+; VI-NEXT:    s_addc_u32 s1, s5, s1
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -5477,17 +5477,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5495,17 +5495,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s4, s0
+; GFX11-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5522,9 +5522,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xf
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5545,67 +5544,67 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s0, s0, s4
-; VI-NEXT:    s_addc_u32 s1, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s4, s0
+; VI-NEXT:    s_addc_u32 s1, s5, s1
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_add_u32 s0, s8, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s9, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT:    s_add_u32 s0, s0, s4
-; GFX11-NEXT:    s_addc_u32 s1, s1, s5
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s4, s0
+; GFX11-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[6:7]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
@@ -5752,36 +5751,36 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v3, s5
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s3, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s4, s0
+; VI-NEXT:    s_addc_u32 s1, s5, s1
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -5790,17 +5789,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5808,17 +5807,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s4, s0
+; GFX11-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5834,9 +5833,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xf
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5857,65 +5855,65 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s0, s0, s4
-; VI-NEXT:    s_addc_u32 s1, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s4, s0
+; VI-NEXT:    s_addc_u32 s1, s5, s1
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_add_u32 s0, s8, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s9, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT:    s_add_u32 s0, s0, s4
-; GFX11-NEXT:    s_addc_u32 s1, s1, s5
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    s_add_u32 s0, s4, s0
+; GFX11-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[6:7]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
@@ -9552,6 +9550,47 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a
 ; GFX9-NEXT:    s_cbranch_execnz .LBB136_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s3, s0, 0x4650
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_and_b32 s0, s3, -4
+; GFX11-NEXT:    s_and_b32 s3, s3, 3
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX11-NEXT:    s_and_b32 s6, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_not_b32 s3, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s6, s5
+; GFX11-NEXT:    s_mov_b32 s5, 0
+; GFX11-NEXT:  .LBB136_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT:    s_cbranch_execnz .LBB136_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
   %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
@@ -9671,6 +9710,47 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad
 ; GFX9-NEXT:    s_cbranch_execnz .LBB137_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s3, s0, 0x2328
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_and_b32 s0, s3, -4
+; GFX11-NEXT:    s_and_b32 s3, s3, 3
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX11-NEXT:    s_and_b32 s6, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_not_b32 s3, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s6, s5
+; GFX11-NEXT:    s_mov_b32 s5, 0
+; GFX11-NEXT:  .LBB137_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT:    s_cbranch_execnz .LBB137_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
   %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 36df710529599..4a943c2233660 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,14 +8,13 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    s_lshl_b64 s[2:3], s[6:7], 3
-; CHECK-NEXT:    s_add_u32 s0, s0, s2
-; CHECK-NEXT:    s_addc_u32 s1, s1, s3
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; CHECK-NEXT:    s_add_u32 s0, s0, s4
+; CHECK-NEXT:    s_addc_u32 s1, s1, s5
 ; CHECK-NEXT:    s_add_u32 s0, s0, -8
 ; CHECK-NEXT:    s_addc_u32 s1, s1, -1
 ; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_wbinvl1_vol
@@ -35,14 +34,13 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    s_lshl_b64 s[2:3], s[6:7], 3
-; CHECK-NEXT:    s_add_u32 s0, s0, s2
-; CHECK-NEXT:    s_addc_u32 s1, s1, s3
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; CHECK-NEXT:    s_add_u32 s0, s0, s4
+; CHECK-NEXT:    s_addc_u32 s1, s1, s5
 ; CHECK-NEXT:    s_add_u32 s0, s0, -8
 ; CHECK-NEXT:    s_addc_u32 s1, s1, -1
 ; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_wbinvl1_vol
@@ -102,20 +100,20 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; CHECK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2c
 ; CHECK-NEXT:    s_mov_b32 s14, -1
 ; CHECK-NEXT:    s_mov_b32 s15, 0xe00000
 ; CHECK-NEXT:    s_add_u32 s12, s12, s11
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
-; CHECK-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
-; CHECK-NEXT:    s_add_u32 s0, s0, s4
-; CHECK-NEXT:    s_addc_u32 s1, s1, s5
-; CHECK-NEXT:    s_add_u32 s4, s0, -8
-; CHECK-NEXT:    s_addc_u32 s5, s1, -1
-; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], 1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s2, s0
+; CHECK-NEXT:    s_addc_u32 s1, s3, s1
+; CHECK-NEXT:    s_add_u32 s2, s0, -8
+; CHECK-NEXT:    s_addc_u32 s3, s1, -1
+; CHECK-NEXT:    s_cmp_eq_u64 s[2:3], 1
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
@@ -124,51 +122,54 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb1
-; CHECK-NEXT:    s_mov_b64 s[0:1], src_shared_base
-; CHECK-NEXT:    s_cmp_eq_u32 s5, s1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_mov_b64 s[0:1], -1
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; CHECK-NEXT:    s_mov_b64 s[6:7], src_shared_base
+; CHECK-NEXT:    s_cmp_eq_u32 s3, s7
+; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB3_5
-; CHECK-NEXT:  ; %bb.3: ; %Flow6
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:  ; %bb.3: ; %Flow4
+; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB3_10
 ; CHECK-NEXT:  .LBB3_4: ; %atomicrmw.phi
 ; CHECK-NEXT:    s_endpgm
 ; CHECK-NEXT:  .LBB3_5: ; %atomicrmw.check.private
-; CHECK-NEXT:    s_mov_b64 s[0:1], src_private_base
-; CHECK-NEXT:    s_cmp_eq_u32 s5, s1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_mov_b64 s[0:1], -1
+; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT:    s_cmp_eq_u32 s3, s5
+; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    s_cbranch_vccz .LBB3_7
 ; CHECK-NEXT:  ; %bb.6: ; %atomicrmw.global
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT:    global_atomic_add_f64 v0, v[2:3], s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    global_atomic_add_f64 v0, v[2:3], s[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_wbinvl1_vol
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:  .LBB3_7: ; %Flow
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB3_9
 ; CHECK-NEXT:  ; %bb.8: ; %atomicrmw.private
-; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; CHECK-NEXT:    s_cselect_b32 s0, s4, -1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT:    s_cselect_b32 s4, s2, -1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s4
 ; CHECK-NEXT:    buffer_load_dword v0, v2, s[12:15], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
 ; CHECK-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
-; CHECK-NEXT:  .LBB3_9: ; %Flow5
+; CHECK-NEXT:  .LBB3_9: ; %Flow3
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_4
 ; CHECK-NEXT:  .LBB3_10: ; %atomicrmw.shared
-; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; CHECK-NEXT:    s_cselect_b32 s0, s4, -1
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT:    s_cselect_b32 s2, s2, -1
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 ; CHECK-NEXT:    ds_add_f64 v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index beeeaa32cacfd..b7248a9b7aa6c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -774,20 +774,20 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s7, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b32 s8, s2, 4
+; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], s[6:7]
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s8
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 ;
@@ -1016,20 +1016,20 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s6, 0x10001
+; GCN-NEXT:    s_mov_b32 s7, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b32 s8, s2, 4
+; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], s[6:7]
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s8
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 ;
@@ -1065,19 +1065,19 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_xor_b32 s7, s1, 0x1010101
+; GCN-NEXT:    s_xor_b32 s6, s0, 0x1010101
+; GCN-NEXT:    s_lshl_b64 s[2:3], 0xff, s2
+; GCN-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 ;
@@ -1755,20 +1755,20 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s6, 1
-; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s3
-; GCN-NEXT:    s_cselect_b32 s2, 0, s2
+; GCN-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s1, 0x3ff00000, s1
 ; GCN-NEXT:    s_cselect_b32 s0, 0, s0
-; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 7cbf9aeacfe48..5faa15cbab2c5 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1544,19 +1544,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s8, s8, 4
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_xor_b32 s1, s3, 0x50005
-; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; SI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_lshl_b32 s2, s2, 4
+; SI-NEXT:    s_xor_b32 s9, s1, 0x50005
+; SI-NEXT:    s_xor_b32 s8, s0, 0x50005
+; SI-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; SI-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
+; SI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1565,20 +1563,18 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ;
 ; VI-LABEL: dynamic_insertelement_v3i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s9, s8
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s0, 0x50005
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
+; VI-NEXT:    s_lshl_b32 s2, s2, 4
+; VI-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; VI-NEXT:    s_xor_b64 s[8:9], s[0:1], s[8:9]
+; VI-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2085,43 +2081,41 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2
 ; SI-LABEL: dynamic_insertelement_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
-; SI-NEXT:    s_mov_b32 s7, 0x100f000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x4
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v3, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s10, 1
-; SI-NEXT:    s_cselect_b32 s3, 0, s3
-; SI-NEXT:    s_cselect_b32 s2, 5, s2
+; SI-NEXT:    s_cselect_b32 s6, 5, 0
 ; SI-NEXT:    s_cmp_eq_u32 s10, 0
-; SI-NEXT:    s_cselect_b32 s1, 0, s1
-; SI-NEXT:    s_cselect_b32 s0, 5, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_mov_b32_e32 v3, s3
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_cselect_b32 s5, 0, s5
+; SI-NEXT:    s_cselect_b32 s4, 5, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT:    s_mov_b32 s7, 0x1100f000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x10
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s10, 1
-; VI-NEXT:    s_cselect_b32 s3, 0, s3
-; VI-NEXT:    s_cselect_b32 s2, 5, s2
+; VI-NEXT:    s_cselect_b32 s6, 5, 0
 ; VI-NEXT:    s_cmp_eq_u32 s10, 0
-; VI-NEXT:    s_cselect_b32 s1, 0, s1
-; VI-NEXT:    s_cselect_b32 s0, 5, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_cselect_b32 s5, 0, s5
+; VI-NEXT:    s_cselect_b32 s4, 5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
   store <2 x i64> %vecins, ptr addrspace(1) %out, align 8
@@ -2132,57 +2126,55 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3
 ; SI-LABEL: dynamic_insertelement_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s10, s[8:9], 0x10
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x8
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0xc
-; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0x100f000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_cmp_eq_u32 s10, 1
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_cselect_b32 s7, 0, s7
-; SI-NEXT:    s_cselect_b32 s6, 5, s6
+; SI-NEXT:    s_cselect_b32 s3, 0, s3
+; SI-NEXT:    s_cselect_b32 s2, 5, s2
 ; SI-NEXT:    s_cmp_eq_u32 s10, 0
-; SI-NEXT:    s_cselect_b32 s5, 0, s5
-; SI-NEXT:    s_cselect_b32 s4, 5, s4
+; SI-NEXT:    s_cselect_b32 s1, 0, s1
+; SI-NEXT:    s_cselect_b32 s0, 5, s0
 ; SI-NEXT:    s_cmp_eq_u32 s10, 2
-; SI-NEXT:    s_cselect_b32 s9, 0, s9
-; SI-NEXT:    s_cselect_b32 s8, 5, s8
+; SI-NEXT:    s_cselect_b32 s8, 5, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:16
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v3i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s10, s[8:9], 0x40
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x30
-; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_cmp_eq_u32 s10, 1
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_cselect_b32 s7, 0, s7
-; VI-NEXT:    s_cselect_b32 s6, 5, s6
+; VI-NEXT:    s_cselect_b32 s3, 0, s3
+; VI-NEXT:    s_cselect_b32 s2, 5, s2
 ; VI-NEXT:    s_cmp_eq_u32 s10, 0
-; VI-NEXT:    s_cselect_b32 s5, 0, s5
-; VI-NEXT:    s_cselect_b32 s4, 5, s4
+; VI-NEXT:    s_cselect_b32 s1, 0, s1
+; VI-NEXT:    s_cselect_b32 s0, 5, s0
 ; VI-NEXT:    s_cmp_eq_u32 s10, 2
-; VI-NEXT:    s_cselect_b32 s9, 0, s9
-; VI-NEXT:    s_cselect_b32 s8, 5, s8
+; VI-NEXT:    s_cselect_b32 s8, 5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
   store <3 x i64> %vecins, ptr addrspace(1) %out, align 32
@@ -2192,68 +2184,66 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3
 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s12, s[8:9], 0x10
+; SI-NEXT:    s_load_dword s10, s[8:9], 0x10
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
-; SI-NEXT:    s_mov_b32 s11, 0x100f000
-; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_eq_u32 s12, 1
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_mov_b32 s7, 0x100f000
+; SI-NEXT:    s_cmp_eq_u32 s10, 1
 ; SI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
 ; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_cmp_eq_u32 s12, 0
+; SI-NEXT:    s_cmp_eq_u32 s10, 0
 ; SI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
-; SI-NEXT:    s_cmp_eq_u32 s12, 3
-; SI-NEXT:    s_cselect_b32 s7, 0x40200000, s7
-; SI-NEXT:    s_cselect_b32 s6, 0, s6
-; SI-NEXT:    s_cmp_eq_u32 s12, 2
-; SI-NEXT:    s_cselect_b32 s5, 0x40200000, s5
-; SI-NEXT:    s_cselect_b32 s4, 0, s4
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NEXT:    s_cmp_eq_u32 s10, 3
+; SI-NEXT:    s_cselect_b32 s8, 0x40200000, 0
+; SI-NEXT:    s_cmp_eq_u32 s10, 2
+; SI-NEXT:    s_cselect_b32 s9, 0x40200000, 0
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s3
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s12, s[8:9], 0x40
+; VI-NEXT:    s_load_dword s10, s[8:9], 0x40
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT:    s_mov_b32 s11, 0x1100f000
-; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s12, 1
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_cmp_eq_u32 s10, 1
 ; VI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
 ; VI-NEXT:    s_cselect_b32 s2, 0, s2
-; VI-NEXT:    s_cmp_eq_u32 s12, 0
+; VI-NEXT:    s_cmp_eq_u32 s10, 0
 ; VI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
 ; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    s_cmp_eq_u32 s12, 3
-; VI-NEXT:    s_cselect_b32 s7, 0x40200000, s7
-; VI-NEXT:    s_cselect_b32 s6, 0, s6
-; VI-NEXT:    s_cmp_eq_u32 s12, 2
-; VI-NEXT:    s_cselect_b32 s5, 0x40200000, s5
-; VI-NEXT:    s_cselect_b32 s4, 0, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; VI-NEXT:    s_cmp_eq_u32 s10, 3
+; VI-NEXT:    s_cselect_b32 s8, 0x40200000, 0
+; VI-NEXT:    s_cmp_eq_u32 s10, 2
+; VI-NEXT:    s_cselect_b32 s9, 0x40200000, 0
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v3, s8
+; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
   store <4 x double> %vecins, ptr addrspace(1) %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 18afb7c677207..4741787aa003f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -101,23 +101,21 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
 ; GCN-LABEL: test_div_fmas_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s8, 0
+; GCN-NEXT:    s_bitcmp1_b32 s14, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s8, s0
-; GCN-NEXT:    s_mov_b32 s9, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
 ; GCN-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
index 79fba612d72f2..c12d098644c06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
@@ -50,9 +50,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyz:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -66,9 +64,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x hal
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyz:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -85,9 +81,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyzw:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -103,9 +97,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x ha
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyzw:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PACKED-NEXT:    v_mov_b32_e32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
index 1d2e325e98125..5d7b4c3c2e605 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
@@ -50,9 +50,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %voffset) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyz:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -66,9 +64,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyz:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -85,9 +81,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %voffset) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyzw:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -103,9 +97,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc,
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyzw:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PACKED-NEXT:    v_mov_b32_e32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
index ff421d1f22699..85da8083a2189 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
@@ -50,9 +50,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyz:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -66,9 +64,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x hal
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyz:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -85,9 +81,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyzw:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -103,9 +97,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x ha
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyzw:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PACKED-NEXT:    v_mov_b32_e32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
index 8fea08df92f36..8f5a10e20517b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
@@ -50,9 +50,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyz:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -66,9 +64,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyz:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -85,9 +81,7 @@ main_body:
 define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) {
 ; UNPACKED-LABEL: buffer_store_format_d16_xyzw:
 ; UNPACKED:       ; %bb.0: ; %main_body
-; UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -103,9 +97,7 @@ define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc,
 ;
 ; PACKED-LABEL: buffer_store_format_d16_xyzw:
 ; PACKED:       ; %bb.0: ; %main_body
-; PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PACKED-NEXT:    v_mov_b32_e32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
index eb28f63e90803..9007e719ee686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
@@ -106,9 +106,7 @@ main_body:
 define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) {
 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
 ; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; PREGFX10-UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; PREGFX10-UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -122,9 +120,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
 ;
 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; PREGFX10-PACKED:       ; %bb.0: ; %main_body
-; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PREGFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -135,10 +131,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
 ;
 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; GFX10-PACKED:       ; %bb.0: ; %main_body
-; GFX10-PACKED-NEXT:    s_clause 0x2
-; GFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -149,15 +142,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
 ;
 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; GFX11-PACKED:       ; %bb.0: ; %main_body
-; GFX11-PACKED-NEXT:    s_clause 0x2
-; GFX11-PACKED-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-NEXT:    s_load_b32 s8, s[4:5], 0x18
-; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT:    s_and_b32 s4, s7, 0xffff
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s4
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s8
+; GFX11-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
@@ -169,9 +159,7 @@ main_body:
 define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) {
 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
 ; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; PREGFX10-UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; PREGFX10-UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -187,9 +175,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
 ;
 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; PREGFX10-PACKED:       ; %bb.0: ; %main_body
-; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PREGFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s5
@@ -199,10 +185,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
 ;
 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; GFX10-PACKED:       ; %bb.0: ; %main_body
-; GFX10-PACKED-NEXT:    s_clause 0x2
-; GFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s5
@@ -212,14 +195,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
 ;
 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; GFX11-PACKED:       ; %bb.0: ; %main_body
-; GFX11-PACKED-NEXT:    s_clause 0x2
-; GFX11-PACKED-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-NEXT:    s_load_b32 s8, s[4:5], 0x18
-; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s7
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s8
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 21f179419d5d1..b959397a1aaba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -132,9 +132,7 @@ main_body:
 define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
 ; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; PREGFX10-UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; PREGFX10-UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
@@ -148,9 +146,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ;
 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; PREGFX10-PACKED:       ; %bb.0: ; %main_body
-; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PREGFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -161,10 +157,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ;
 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; GFX10-PACKED:       ; %bb.0: ; %main_body
-; GFX10-PACKED-NEXT:    s_clause 0x2
-; GFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
@@ -175,15 +168,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ;
 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
 ; GFX11-PACKED:       ; %bb.0: ; %main_body
-; GFX11-PACKED-NEXT:    s_clause 0x2
-; GFX11-PACKED-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-NEXT:    s_load_b32 s8, s[4:5], 0x18
-; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT:    s_and_b32 s4, s7, 0xffff
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s4
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s8
+; GFX11-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
@@ -220,9 +210,7 @@ main_body:
 define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
 ; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; PREGFX10-UNPACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
+; PREGFX10-UNPACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s7, s5, 16
 ; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -238,9 +226,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ;
 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; PREGFX10-PACKED:       ; %bb.0: ; %main_body
-; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; PREGFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; PREGFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s5
@@ -250,10 +236,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ;
 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; GFX10-PACKED:       ; %bb.0: ; %main_body
-; GFX10-PACKED-NEXT:    s_clause 0x2
-; GFX10-PACKED-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX10-PACKED-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX10-PACKED-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-PACKED-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s5
@@ -263,14 +246,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ;
 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
 ; GFX11-PACKED:       ; %bb.0: ; %main_body
-; GFX11-PACKED-NEXT:    s_clause 0x2
-; GFX11-PACKED-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-NEXT:    s_load_b32 s8, s[4:5], 0x18
-; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s7
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s8
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index a5189acba98c0..b710e0169577c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -102,54 +102,54 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s0, m0
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_sreg_i64:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
-; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
@@ -212,54 +212,54 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_sreg_f64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s0, m0
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_sreg_f64:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
-; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
@@ -1572,29 +1572,24 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs
 define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_clause 0x2
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX1010-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x18
+; GFX1010-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s1
@@ -1606,32 +1601,27 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_clause 0x2
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[4:5], 0x18
+; GFX1100-SDAG-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s5, s6
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s4, s6
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX802-GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1640,10 +1630,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 ;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_clause 0x2
-; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX1010-GISEL-NEXT:    s_load_dword s6, s[8:9], 0x18
+; GFX1010-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1655,16 +1642,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 ;
 ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_clause 0x2
-; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[4:5], 0x18
+; GFX1100-GISEL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s4
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s4, s6
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s5, s6
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
@@ -1675,29 +1659,24 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_clause 0x2
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX1010-SDAG-NEXT:    s_load_dword s6, s[8:9], 0x18
+; GFX1010-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s1
@@ -1709,32 +1688,27 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_clause 0x2
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[4:5], 0x18
+; GFX1100-SDAG-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s5, s6
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s4, s6
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT:    s_load_dword s6, s[8:9], 0x18
-; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX802-GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1743,10 +1717,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad
 ;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_clause 0x2
-; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX1010-GISEL-NEXT:    s_load_dword s6, s[8:9], 0x18
+; GFX1010-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1758,16 +1729,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad
 ;
 ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_clause 0x2
-; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
-; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[4:5], 0x18
+; GFX1100-GISEL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s4
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s4, s6
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s5, s6
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
@@ -1853,48 +1821,48 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s4
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s4
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s4
-; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
@@ -1950,48 +1918,48 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64:
 ; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX802-SDAG-NEXT:    s_add_i32 s12, s12, s17
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX1010-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s4
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s4
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s4
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s4
-; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 0e250baa86090..4636e401feb24 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -409,21 +409,21 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:8
 ; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_xor_b32_e32 v46, v9, v5
-; CHECK-NEXT:    v_xor_b32_e32 v45, v8, v4
-; CHECK-NEXT:    v_xor_b32_e32 v57, v11, v7
-; CHECK-NEXT:    v_xor_b32_e32 v56, v10, v6
-; CHECK-NEXT:    v_or_b32_e32 v5, v46, v57
-; CHECK-NEXT:    v_or_b32_e32 v4, v45, v56
+; CHECK-NEXT:    v_xor_b32_e32 v57, v9, v5
+; CHECK-NEXT:    v_xor_b32_e32 v56, v8, v4
+; CHECK-NEXT:    v_xor_b32_e32 v46, v11, v7
+; CHECK-NEXT:    v_xor_b32_e32 v45, v10, v6
+; CHECK-NEXT:    v_or_b32_e32 v5, v57, v46
+; CHECK-NEXT:    v_or_b32_e32 v4, v56, v45
 ; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_27
 ; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx2 v[58:59], v[2:3], off offset:24
 ; CHECK-NEXT:    global_load_dwordx2 v[60:61], v[0:1], off offset:24
-; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 4, v45
-; CHECK-NEXT:    v_alignbit_b32 v1, v46, v45, 12
-; CHECK-NEXT:    v_and_b32_e32 v2, 0xf0000, v45
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 4, v56
+; CHECK-NEXT:    v_alignbit_b32 v1, v57, v56, 12
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xf0000, v56
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_add_u32 s8, s34, 40
 ; CHECK-NEXT:    v_and_b32_e32 v3, 0xf000, v0
@@ -459,28 +459,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_xor_b32 s4, exec_lo, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_31
 ; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT:    v_xor_b32_e32 v4, v60, v58
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], 16, v[56:57]
-; CHECK-NEXT:    v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 6, v72
-; CHECK-NEXT:    v_lshlrev_b32_e32 v9, 12, v63
-; CHECK-NEXT:    v_xor_b32_e32 v5, v61, v59
-; CHECK-NEXT:    v_lshlrev_b32_e32 v11, 16, v56
-; CHECK-NEXT:    v_or_b32_e32 v3, v1, v3
+; CHECK-NEXT:    v_xor_b32_e32 v2, v61, v59
+; CHECK-NEXT:    v_xor_b32_e32 v1, v60, v58
+; CHECK-NEXT:    v_mad_u64_u32 v[7:8], null, 0x180, v73, s[66:67]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v12, 5, v0
+; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 16, v[56:57]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 16, v45
+; CHECK-NEXT:    v_lshlrev_b32_e32 v9, 6, v72
+; CHECK-NEXT:    v_lshrrev_b64 v[2:3], 16, v[1:2]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 12, v63
+; CHECK-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; CHECK-NEXT:    v_lshrrev_b64 v[0:1], 16, v[45:46]
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v10
-; CHECK-NEXT:    v_or3_b32 v8, v8, v9, v62
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 16, v[4:5]
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v12
+; CHECK-NEXT:    v_or_b32_e32 v6, v4, v6
+; CHECK-NEXT:    v_or3_b32 v4, v9, v10, v62
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
 ; CHECK-NEXT:    v_or_b32_e32 v1, v11, v1
+; CHECK-NEXT:    global_store_dwordx3 v[7:8], v[4:6], off offset:4
+; CHECK-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:16
 ; CHECK-NEXT:    ; implicit-def: $vgpr42
 ; CHECK-NEXT:    ; implicit-def: $vgpr43
 ; CHECK-NEXT:    ; implicit-def: $vgpr44
-; CHECK-NEXT:    global_store_dword v[6:7], v8, off offset:4
-; CHECK-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off offset:8
-; CHECK-NEXT:    global_store_dwordx2 v[6:7], v[4:5], off offset:24
 ; CHECK-NEXT:  .LBB0_31: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index ddfd7875da41d..bafc2b2fdd975 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1036,105 +1036,101 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
 define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_uniform:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s3
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
-; CI-NEXT:    s_mov_b32 s4, s0
-; CI-NEXT:    s_mov_b32 s5, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; SI-LABEL: mad_i64_i32_uniform:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mul_i32 s0, s2, s3
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mul_hi_u32 v1, s0, v0
+; SI-NEXT:    s_mul_i32 s1, s0, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: mad_i64_i32_uniform:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s3
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    s_add_u32 s2, s2, s6
-; GFX9-NEXT:    s_addc_u32 s3, s4, s7
-; GFX9-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX9-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s4, s3
+; GFX9-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX9-NEXT:    global_store_dwordx2 v0, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mad_i64_i32_uniform:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_i32 s6, s2, s3
-; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
-; GFX11-NEXT:    s_add_u32 s2, s6, s4
-; GFX11-NEXT:    s_addc_u32 s3, s3, s5
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_mul_i32 s6, s0, s1
+; GFX11-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX11-NEXT:    s_add_u32 s0, s6, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mad_i64_i32_uniform:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX12-NEXT:    s_mov_b32 s7, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s6, s2
-; GFX12-NEXT:    s_mov_b32 s2, s3
-; GFX12-NEXT:    s_mov_b32 s3, s7
+; GFX12-NEXT:    s_mov_b32 s6, s0
+; GFX12-NEXT:    s_mov_b32 s0, s1
+; GFX12-NEXT:    s_mov_b32 s1, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_mul_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: mad_i64_i32_uniform:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 nv
-; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34 nv
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c nv
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_mov_b32 s5, 0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s4, s2
-; GFX1250-NEXT:    s_mov_b32 s2, s3
-; GFX1250-NEXT:    s_mov_b32 s3, s5
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    s_mov_b32 s0, s1
+; GFX1250-NEXT:    s_mov_b32 s1, s5
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT:    s_mul_u64 s[2:3], s[4:5], s[2:3]
-; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[6:7]
+; GFX1250-NEXT:    s_mul_u64 s[0:1], s[4:5], s[0:1]
+; GFX1250-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
 ; GFX1250-NEXT:    s_endpgm
   %ext0 = zext i32 %arg0 to i64
   %ext1 = zext i32 %arg1 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
index 990a986ffab75..91722a687fb2b 100644
--- a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
@@ -595,15 +595,22 @@ define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i
 ; GFX942-SDAG-NEXT:  ; %bb.2: ; %static-memset-post-expansion
 ; GFX942-SDAG-NEXT:    s_mov_b32 s0, 0x4040404
 ; GFX942-SDAG-NEXT:    v_perm_b32 v4, v2, v2, s0
-; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v4
-; GFX942-SDAG-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x41e
 ; GFX942-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:1024
-; GFX942-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[4:6] offset:1040
-; GFX942-SDAG-NEXT:    flat_store_short v[0:1], v3 offset:1052
-; GFX942-SDAG-NEXT:    flat_store_byte v[0:1], v2 offset:1054
+; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
+; GFX942-SDAG-NEXT:    v_or_b32_sdwa v8, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x41c
+; GFX942-SDAG-NEXT:    flat_store_byte v[6:7], v2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x418
+; GFX942-SDAG-NEXT:    flat_store_short v[2:3], v8
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT:    flat_store_dword v[2:3], v4
+; GFX942-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:1040
 ; GFX942-SDAG-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
 ; GFX942-SDAG-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
 ; GFX942-SDAG-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
@@ -665,19 +672,25 @@ define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i
 ; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v3, 8, v3
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX942-GISEL-NEXT:    v_or3_b32 v6, v4, v5, v3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 8
-; GFX942-GISEL-NEXT:    v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX942-GISEL-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v4
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v6
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
-; GFX942-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[6:9] offset:1024
-; GFX942-GISEL-NEXT:    flat_store_dwordx3 v[0:1], v[4:6] offset:1040
-; GFX942-GISEL-NEXT:    flat_store_short v[0:1], v3 offset:1052
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:1024
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1040
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1041
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1042
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1043
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1044
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1045
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1046
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1047
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1048
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1049
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1050
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1051
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1052
+; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1053
 ; GFX942-GISEL-NEXT:    flat_store_byte v[0:1], v2 offset:1054
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -913,13 +926,14 @@ define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i
 ; GFX942-SDAG-NEXT:    v_perm_b32 v4, v2, v2, s0
 ; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v4
-; GFX942-SDAG-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:1024
-; GFX942-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[4:6], off offset:1040
-; GFX942-SDAG-NEXT:    global_store_short v[0:1], v3, off offset:1052
 ; GFX942-SDAG-NEXT:    global_store_byte v[0:1], v2, off offset:1054
+; GFX942-SDAG-NEXT:    global_store_short v[0:1], v3, off offset:1052
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:1024
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off offset:1040
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -967,19 +981,25 @@ define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i
 ; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v3, 8, v3
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX942-GISEL-NEXT:    v_or3_b32 v6, v4, v5, v3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 8
-; GFX942-GISEL-NEXT:    v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX942-GISEL-NEXT:    v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v4
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v6
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v3
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:1024
-; GFX942-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[4:6], off offset:1040
-; GFX942-GISEL-NEXT:    global_store_short v[0:1], v3, off offset:1052
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:1024
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1040
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1041
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1042
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1043
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1044
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1045
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1046
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1047
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1048
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1049
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1050
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1051
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1052
+; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1053
 ; GFX942-GISEL-NEXT:    global_store_byte v[0:1], v2, off offset:1054
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1193,8 +1213,8 @@ define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i
 ; GFX942-SDAG-NEXT:    ds_write_b32 v0, v4 offset:1048
 ; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
 ; GFX942-SDAG-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-SDAG-NEXT:    ds_write_b16 v0, v2 offset:1052
 ; GFX942-SDAG-NEXT:    ds_write_b8 v0, v1 offset:1054
+; GFX942-SDAG-NEXT:    ds_write_b16 v0, v2 offset:1052
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1249,13 +1269,16 @@ define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX942-GISEL-NEXT:    v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX942-GISEL-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v4
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xffff
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX942-GISEL-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v3, 16, v3
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 0x410, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX942-GISEL-NEXT:    ds_write2_b32 v5, v4, v4 offset1:1
 ; GFX942-GISEL-NEXT:    ds_write_b32 v0, v2 offset:1048
-; GFX942-GISEL-NEXT:    ds_write_b16 v0, v3 offset:1052
-; GFX942-GISEL-NEXT:    ds_write_b8 v0, v1 offset:1054
+; GFX942-GISEL-NEXT:    ds_write_b16 v0, v1 offset:1052
+; GFX942-GISEL-NEXT:    ds_write_b8_d16_hi v0, v1 offset:1054
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1460,12 +1483,14 @@ define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX942-SDAG-NEXT:    scratch_store_dwordx4 v0, v[2:5], off offset:1024
-; GFX942-SDAG-NEXT:    scratch_store_dwordx2 v0, v[2:3], off offset:1040
-; GFX942-SDAG-NEXT:    scratch_store_dword v0, v2, off offset:1048
-; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
-; GFX942-SDAG-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-SDAG-NEXT:    scratch_store_short v0, v2, off offset:1052
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
+; GFX942-SDAG-NEXT:    v_or_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:1054
+; GFX942-SDAG-NEXT:    scratch_store_short v0, v4, off offset:1052
+; GFX942-SDAG-NEXT:    scratch_store_dword v0, v2, off offset:1048
+; GFX942-SDAG-NEXT:    scratch_store_dwordx2 v0, v[2:3], off offset:1040
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1517,16 +1542,20 @@ define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX942-GISEL-NEXT:    scratch_store_dwordx4 v0, v[2:5], off offset:1024
-; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 8
-; GFX942-GISEL-NEXT:    v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX942-GISEL-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GFX942-GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v4
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
-; GFX942-GISEL-NEXT:    scratch_store_dwordx2 v0, v[4:5], off offset:1040
-; GFX942-GISEL-NEXT:    scratch_store_dword v0, v2, off offset:1048
-; GFX942-GISEL-NEXT:    scratch_store_short v0, v3, off offset:1052
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1040
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1041
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1042
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1043
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1044
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1045
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1046
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1047
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1048
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1049
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1050
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1051
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1052
+; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1053
 ; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off offset:1054
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
index a588e99980c6e..7d1333ef2ab90 100644
--- a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
+++ b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
@@ -21,9 +21,9 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 64, 0 :: (dereferenceable invariant load (s32) from %ir.loopcond.kernarg.offset.align.down, align 16, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0.kernarg.offset1, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[COPY]](p4), 16, 0 :: (dereferenceable invariant load (s32) from %ir.arg0.kernarg.offset1 + 16, align 16, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 32, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 32, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0.kernarg.offset1, align 16, addrspace 4)
+  ; CHECK-NEXT:   early-clobber %53:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s128) from %ir.p15.kernarg.offset, align 8, addrspace 4)
   ; CHECK-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
   ; CHECK-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
   ; CHECK-NEXT:   [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]](s32), implicit $exec
@@ -45,9 +45,9 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:av_128_align2 = COPY [[S_MOV_B32_]]
   ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12288, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM2]], [[V_XOR_B32_e32_]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM2]], [[V_LSHLREV_B32_e32_1]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM1]], [[V_XOR_B32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM1]], [[V_LSHLREV_B32_e32_1]], implicit $exec
   ; CHECK-NEXT:   [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_CSELECT_B64_]], -1, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_XOR_B64_]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
@@ -57,7 +57,7 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT: bb.1.loop:
   ; CHECK-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_MOV_B32_e32_1]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %53.sub1, [[V_MOV_B32_e32_1]], implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_]], 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_2]], [[S_MOV_B32_]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
@@ -89,124 +89,123 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_2]].sub0
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_2]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg2.kernarg.offset, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 48, 0 :: (dereferenceable invariant load (s32) from %ir.x7.kernarg.offset, align 16, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 56, 0 :: (dereferenceable invariant load (s32) from %ir.a7.kernarg.offset, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 72, 0 :: (dereferenceable invariant load (s32) from %ir.a3.kernarg.offset, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 80, 0 :: (dereferenceable invariant load (s32) from %ir.a2.kernarg.offset, align 16, addrspace 4)
-  ; CHECK-NEXT:   early-clobber %119:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s64) from %ir.arg8.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg2.kernarg.offset, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 48, 0 :: (dereferenceable invariant load (s32) from %ir.x7.kernarg.offset, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 56, 0 :: (dereferenceable invariant load (s32) from %ir.a7.kernarg.offset, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 72, 0 :: (dereferenceable invariant load (s32) from %ir.a3.kernarg.offset, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 80, 0 :: (dereferenceable invariant load (s128) from %ir.a2.kernarg.offset, addrspace 4)
   ; CHECK-NEXT:   KILL [[COPY]](p4)
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16384, implicit $exec
   ; CHECK-NEXT:   [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20480, implicit $exec
   ; CHECK-NEXT:   undef [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   undef [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
   ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
   ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN4:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]].sub1:vreg_64_align2 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM4]]
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY %119.sub0
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]].sub1:vreg_64_align2 = COPY %119.sub1
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM2]]
+  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_LOAD_DWORD_IMM3]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM5]]
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM3]]
-  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_LOAD_DWORD_IMM4]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM6]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN5:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY7]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN6:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[COPY7]]
   ; CHECK-NEXT:   KILL [[V_MOV_B32_e32_5]]
+  ; CHECK-NEXT:   KILL [[COPY7]]
   ; CHECK-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_ASHR_I32_]], 26, implicit-def dead $scc
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN7:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY9]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN8:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   KILL [[COPY9]]
-  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LOAD_DWORD_IMM4]], [[S_LSHR_B32_]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LOAD_DWORD_IMM3]], [[S_LSHR_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_ASHR_I32_1:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_ADD_I32_]], 6, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[S_ASHR_I32_1]], 1, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], 19456, implicit-def dead $scc
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN9:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY8]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN10:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY10]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   KILL [[COPY10]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_1]], 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM7]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY5]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM8]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM6]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN11:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY11]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN12:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY12]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[COPY11]]
   ; CHECK-NEXT:   KILL [[COPY12]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY6]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   KILL [[COPY11]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY5]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN13:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN14:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
   ; CHECK-NEXT:   KILL [[V_MOV_B32_e32_6]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY6]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN15:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY13]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_MOV_B32_1]].sub0
-  ; CHECK-NEXT:   [[V_OR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX4_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_OR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e32_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 1, [[COPY1]](s32), implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e32_4:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 24, [[V_AND_B32_e32_1]], implicit $exec
   ; CHECK-NEXT:   [[V_XOR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[V_XOR_B32_e32_]], [[V_OR_B32_e32_3]], implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e32_5:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_XOR_B32_e32_2]], [[V_XOR_B32_e32_1]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_LSHLREV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 1, [[V_OR_B32_e32_5]], implicit $exec
   ; CHECK-NEXT:   [[V_XOR_B32_e32_3:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 1, [[V_OR_B32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_OR_B32_e32_6:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_XOR_B32_e32_3]], implicit $exec
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_OR_B32_e32_6:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX4_IMM]].sub0, [[V_XOR_B32_e32_3]], implicit $exec
   ; CHECK-NEXT:   [[V_XOR_B32_e32_4:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 1, [[V_OR_B32_e32_4]], implicit $exec
   ; CHECK-NEXT:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 3, [[COPY1]](s32), implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_LSHLREV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 1, [[COPY1]](s32), implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[BUFFER_LOAD_DWORDX4_OFFSET2]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) null`, addrspace 3)
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[COPY8]], [[BUFFER_LOAD_DWORDX4_OFFEN2]], 0, 0, implicit $exec :: (store (s128) into %ir.p18, addrspace 3)
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e32_7:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 1, [[V_AND_B32_e32_2]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM3]], [[V_OR_B32_e32_7]], implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[V_ADD_U32_e32_3]], [[BUFFER_LOAD_DWORDX4_OFFSET3]], 512, 0, implicit $exec :: (store (s128) into %ir.9, addrspace 3)
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM2]], [[V_OR_B32_e32_7]], implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[V_ADD_U32_e32_3]], [[BUFFER_LOAD_DWORDX4_OFFSET3]], 512, 0, implicit $exec :: (store (s128) into %ir.11, addrspace 3)
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY %53.sub0
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_1:%[0-9]+]]:av_32 = DS_READ_B32_gfx9 [[COPY14]], 0, 0, implicit $exec :: (load (s32) from %ir.4, align 8, addrspace 3)
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_LSHLREV_B32_e32_1]], implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_1:%[0-9]+]]:av_32 = DS_READ_B32_gfx9 [[COPY14]], 0, 0, implicit $exec :: (load (s32) from %ir.5, align 8, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX2_IMM]].sub1, [[V_LSHLREV_B32_e32_1]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX2_IMM]].sub1, [[V_OR_B32_e32_2]], implicit $exec
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_OR_B32_e32_2]], implicit $exec
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_2:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_4]], 0, 0, implicit $exec :: (load (s32) from %ir.p10, addrspace 3)
   ; CHECK-NEXT:   [[DS_READ_B64_gfx9_:%[0-9]+]]:av_64_align2 = DS_READ_B64_gfx9 [[V_ADD_U32_e32_5]], 0, 0, implicit $exec :: (load (s64) from %ir.p11, addrspace 3)
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_3:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_2:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
   ; CHECK-NEXT:   undef [[COPY15:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:av_64_align2 = COPY [[DS_READ_B64_gfx9_]].sub1
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM1]], [[V_LSHLREV_B32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %53.sub2, [[V_LSHLREV_B32_e32_]], implicit $exec
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_4:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_6]], 0, 0, implicit $exec :: (load (s32) from %ir.p4, addrspace 3)
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_4:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_SUB_U32_e32_]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_ADD_U32_e32_7]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %53.sub1, [[V_SUB_U32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX2_IMM]].sub1, [[V_ADD_U32_e32_7]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %53.sub1, [[V_OR_B32_e32_1]], implicit $exec
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_29:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_OR_B32_e32_1]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_LSHLREV_B32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %53.sub1, [[V_LSHLREV_B32_e32_]], implicit $exec
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_5:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_8]], 0, 0, implicit $exec :: (load (s32) from %ir.p8, addrspace 3)
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_6:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_9]], 0, 0, implicit $exec :: (load (s32) from %ir.sunkaddr3, addrspace 3)
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_7:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_10]], 0, 0, implicit $exec :: (load (s32) from %ir.p5, addrspace 3)
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_3:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_7]], 0, 0, implicit $exec :: (load (s32) from %ir.i85, align 8, addrspace 3)
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM2]], [[V_OR_B32_e32_6]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM1]], [[V_OR_B32_e32_6]], implicit $exec
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_8:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_11]], 0, 0, implicit $exec :: (load (s32) from %ir.p2, align 16, addrspace 3)
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_9:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e32_2]], 8192, 0, implicit $exec :: (load (s32) from %ir.p13, addrspace 3)
   ; CHECK-NEXT:   undef [[DS_READ_B32_gfx9_10:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_XOR_B32_e32_4]], 8192, 0, implicit $exec :: (load (s32) from %ir.p9, align 16, addrspace 3)
@@ -219,7 +218,7 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_30:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_8:%[0-9]+]].sub1:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
   ; CHECK-NEXT:   [[DS_READ_B32_gfx9_10:%[0-9]+]].sub1:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
-  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHRREV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHRREV_B32_e32_]], [[S_LOAD_DWORDX2_IMM]].sub0, implicit $exec
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_31:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_2]], [[BUFFER_LOAD_DWORDX4_OFFEN4]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   undef [[V_AND_OR_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_AND_OR_B32_e64 [[V_LSHLREV_B32_e32_3]], 56, [[V_MUL_LO_U32_e64_]], implicit $exec
   ; CHECK-NEXT:   [[V_AND_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]](s32), 48, [[V_AND_B32_e32_]], implicit $exec
@@ -256,7 +255,7 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_LSHLREV_B32_e32_4]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_59]].sub0, 0, 0, implicit $exec :: (store (s32) into %ir.p21, addrspace 3)
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_58]].sub0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) null`, addrspace 3)
   ; CHECK-NEXT:   [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN13]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_56]], 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[COPY14]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60]].sub0, 0, 0, implicit $exec :: (store (s32) into %ir.4, addrspace 3)
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[COPY14]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60]].sub0, 0, 0, implicit $exec :: (store (s32) into %ir.5, addrspace 3)
   ; CHECK-NEXT:   [[V_AND_OR_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_ASHRREV_I32_e32 31, [[V_AND_OR_B32_e64_]].sub0, implicit $exec
   ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 1, [[V_AND_OR_B32_e64_]], implicit $exec
   ; CHECK-NEXT:   GLOBAL_ATOMIC_PK_ADD_BF16 [[V_LSHLREV_B64_e64_]], [[DS_READ_B32_gfx9_]].sub1, 0, 0, implicit $exec :: (load store syncscope("agent") monotonic (s32) on %ir.p20, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index d21cda572f5f4..12b7ed137e86c 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3902,19 +3902,19 @@ define amdgpu_kernel void @more_any_extend_to_perm(i8 %arg, <4 x i8> %arg1, i8 %
 ; GFX10-LABEL: more_any_extend_to_perm:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x30
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xc0c0304
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0xc0c0104
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v2, s6, s7, v0
-; GFX10-NEXT:    v_perm_b32 v0, s0, s7, v0
-; GFX10-NEXT:    v_perm_b32 v3, s0, s7, v1
-; GFX10-NEXT:    v_perm_b32 v1, s6, s7, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, s1
+; GFX10-NEXT:    v_perm_b32 v2, s0, s1, v0
+; GFX10-NEXT:    v_perm_b32 v0, s2, s1, v0
+; GFX10-NEXT:    v_perm_b32 v3, s2, s1, v1
+; GFX10-NEXT:    v_perm_b32 v1, s0, s1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s2
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    ds_write_b32 v4, v2
@@ -3923,22 +3923,22 @@ define amdgpu_kernel void @more_any_extend_to_perm(i8 %arg, <4 x i8> %arg1, i8 %
 ;
 ; GFX9-LABEL: more_any_extend_to_perm:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc0c0304
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x30
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xc0c0104
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_perm_b32 v4, s6, v2, v1
-; GFX9-NEXT:    v_perm_b32 v1, s0, v2, v1
-; GFX9-NEXT:    v_perm_b32 v3, s0, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_perm_b32 v4, s0, v2, v1
+; GFX9-NEXT:    v_perm_b32 v1, s2, v2, v1
+; GFX9-NEXT:    v_perm_b32 v3, s2, v2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_perm_b32 v0, s6, v2, v0
+; GFX9-NEXT:    v_perm_b32 v0, s0, v2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    ds_write_b32 v4, v3
 ; GFX9-NEXT:    ds_write_b32 v1, v0
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/pr155452.ll b/llvm/test/CodeGen/AMDGPU/pr155452.ll
index 928997e9fecb8..7ace890c6fa8f 100644
--- a/llvm/test/CodeGen/AMDGPU/pr155452.ll
+++ b/llvm/test/CodeGen/AMDGPU/pr155452.ll
@@ -9,16 +9,15 @@ define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) {
 ; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-NEXT:    s_add_i32 s12, s12, s17
 ; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x2
-; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], 1
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_ashr_i32 s6, s0, 31
-; CHECK-NEXT:    s_abs_i32 s7, s0
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s7
-; CHECK-NEXT:    s_sub_i32 s0, 0, s7
+; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
+; CHECK-NEXT:    s_abs_i32 s6, s6
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT:    s_sub_i32 s0, 0, s6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -28,39 +27,39 @@ define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) {
 ; CHECK-NEXT:    s_and_b64 s[0:1], exec, -1
 ; CHECK-NEXT:  .LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], s2, v3, 1
-; CHECK-NEXT:    s_mul_i32 s4, s3, s4
-; CHECK-NEXT:    s_mul_i32 s5, s2, s5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
-; CHECK-NEXT:    v_readfirstlane_b32 s4, v3
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, s5, v4
-; CHECK-NEXT:    s_ashr_i32 s5, s4, 31
-; CHECK-NEXT:    s_abs_i32 s8, s4
-; CHECK-NEXT:    s_xor_b32 s5, s5, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s2
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], s4, v3, 1
+; CHECK-NEXT:    s_mul_i32 s2, s5, s2
+; CHECK-NEXT:    s_mul_i32 s3, s4, s3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, s2, v4
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, s3, v4
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_abs_i32 s8, s2
+; CHECK-NEXT:    s_xor_b32 s3, s3, s7
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s8, v2
 ; CHECK-NEXT:    v_readfirstlane_b32 s9, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CHECK-NEXT:    s_mul_i32 s9, s9, s7
+; CHECK-NEXT:    s_mul_i32 s9, s9, s6
 ; CHECK-NEXT:    s_sub_i32 s8, s8, s9
-; CHECK-NEXT:    s_sub_i32 s9, s8, s7
-; CHECK-NEXT:    s_cmp_ge_u32 s8, s7
+; CHECK-NEXT:    s_sub_i32 s9, s8, s6
+; CHECK-NEXT:    s_cmp_ge_u32 s8, s6
 ; CHECK-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; CHECK-NEXT:    s_cselect_b32 s8, s9, s8
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CHECK-NEXT:    s_cmp_ge_u32 s8, s7
+; CHECK-NEXT:    s_cmp_ge_u32 s8, s6
 ; CHECK-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, s5, v3
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
+; CHECK-NEXT:    v_xor_b32_e32 v3, s3, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; CHECK-NEXT:    v_or_b32_e32 v3, s4, v3
+; CHECK-NEXT:    v_or_b32_e32 v3, s2, v3
 ; CHECK-NEXT:    v_or_b32_e32 v4, v4, v5
 ; CHECK-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-NEXT:    s_mov_b64 vcc, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index e754f665c5f43..e6e9f96b00bde 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -360,8 +360,6 @@ define amdgpu_kernel void @select_v8f32(ptr addrspace(1) %out, <8 x float> %a, <
 ; GCN-LABEL: {{^}}select_v2f64:
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-; GCN: s_cselect_b32
-; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
index b0575440a76ad..436823a7d7f08 100644
--- a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -5,12 +5,12 @@ define amdgpu_kernel void @foo(ptr %0) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; CHECK-NEXT:    flat_store_dwordx3 v[2:3], v[4:6]
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
 ; CHECK-NEXT:    s_endpgm
 entry:
   %1 = getelementptr inbounds i8, ptr %0, i64 4
diff --git a/llvm/test/CodeGen/AMDGPU/simulated-trap-pseudo-expand.ll b/llvm/test/CodeGen/AMDGPU/simulated-trap-pseudo-expand.ll
index 9796db4717e7b..9d7c9db84a0ef 100644
--- a/llvm/test/CodeGen/AMDGPU/simulated-trap-pseudo-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/simulated-trap-pseudo-expand.ll
@@ -6,24 +6,21 @@ declare void @llvm.trap()
 define amdgpu_kernel void @simulated_trap_pseudo_expand(i64 %offset, i1 %should_continue, ptr %base) {
 ; CHECK-LABEL: simulated_trap_pseudo_expand:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
+; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s2, -1, 0
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %normal_path
-; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    s_load_b64 s[2:3], s[4:5], 0x10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s2, s0
 ; CHECK-NEXT:    s_addc_u32 s1, s3, s1
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; CHECK-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, v0
 ; CHECK-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 251908b1b0f94..50438a38a06ad 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,58 +32,57 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x14
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b32 s8, s5
+; GLOBALNESS1-NEXT:    s_mov_b32 s9, s6
+; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[8:9], v[0:1]
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[8:9], 0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GLOBALNESS1-NEXT:    s_xor_b64 s[8:9], s[10:11], -1
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s4, 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[10:11]
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v3
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[8:9]
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s8, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s9, 1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
-; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr58_vgpr59
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
@@ -93,20 +92,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 2
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 4
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 5
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 6
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 7
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s70, 8
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s71, 9
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s68, 8
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s69, 9
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -114,11 +113,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 7
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow16
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow29
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
@@ -138,13 +137,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[70:71]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -155,12 +154,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock13
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow27
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -169,7 +168,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow26
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -182,7 +181,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[50:51], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -191,6 +190,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s9, 11
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v57, 2
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v57, 3
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s55
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
@@ -206,25 +206,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_16
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow17
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[64:65]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[64:65]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -237,33 +237,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[70:71]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_add_u32 s70, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s71, s39, 0
+; GLOBALNESS1-NEXT:    s_add_u32 s68, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s69, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[58:59], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -273,19 +273,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v57, 8
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s8, v57, 10
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v57, 9
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
+; GLOBALNESS1-NEXT:    s_mov_b32 s55, s83
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v57, 9
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[50:51]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
@@ -315,9 +313,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
@@ -333,9 +331,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
@@ -343,58 +341,57 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x14
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b32 s8, s5
+; GLOBALNESS0-NEXT:    s_mov_b32 s9, s6
+; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[8:9], v[0:1]
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[8:9], 0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GLOBALNESS0-NEXT:    s_xor_b64 s[8:9], s[10:11], -1
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s4, 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[10:11]
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[82:83], 1, v3
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[8:9]
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s8, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s9, 1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
-; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
-; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
-; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s16
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s15
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s14
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr58_vgpr59
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
@@ -404,20 +401,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 2
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 4
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 5
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 6
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 7
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s84, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s85, 9
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s82, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s83, 9
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -425,11 +422,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 7
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow16
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow29
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
@@ -449,13 +446,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -466,12 +463,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock13
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow27
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -480,7 +477,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow26
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -493,7 +490,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[50:51], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -502,7 +499,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s9, 11
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v57, 2
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v57, 3
-; GLOBALNESS0-NEXT:    s_mov_b32 s83, s55
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
@@ -518,25 +514,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_16
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow17
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[64:65]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[64:65]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -549,33 +545,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[70:71]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s84, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s85, s39, 0
+; GLOBALNESS0-NEXT:    s_add_u32 s82, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s83, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[58:59], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -585,17 +581,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v57, 8
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v57, 10
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v57, 9
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_mov_b32 s55, s7
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[50:51]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
@@ -625,9 +623,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
@@ -643,9 +641,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index 3558c4e1d0c85..05a3ecc7b80a5 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -11,14 +11,14 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; CHECK-NEXT:    flat_load_dword v42, v[44:45]
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
-; CHECK-NEXT:    s_load_dwordx4 s[64:67], s[34:35], 0x8
 ; CHECK-NEXT:    s_load_dword s68, s[34:35], 0x0
+; CHECK-NEXT:    s_load_dwordx4 s[64:67], s[34:35], 0x8
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], src_private_base
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s68, -1
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_cselect_b32 s5, s9, 0
@@ -27,7 +27,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    v_mov_b32_e32 v47, s5
 ; CHECK-NEXT:    s_mov_b32 s5, s4
 ; CHECK-NEXT:    s_addc_u32 s51, s35, 0
-; CHECK-NEXT:    v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT:    v_pk_mov_b32 v[60:61], s[4:5], s[4:5] op_sel:[0,1]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, G at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, G at gotpcrel32@hi+12
@@ -47,12 +47,10 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    s_mov_b32 s52, s15
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v0
-; CHECK-NEXT:    v_mov_b32_e32 v60, s66
-; CHECK-NEXT:    v_mov_b32_e32 v61, s67
-; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[62:63]
+; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[60:61]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT:    flat_load_dwordx2 a[32:33], v[56:57]
+; CHECK-NEXT:    flat_load_dwordx2 v[62:63], v[56:57]
 ; CHECK-NEXT:    v_mov_b32_e32 v58, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v59, 0x3ff00000
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
@@ -64,24 +62,24 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    s_mov_b32 s14, s33
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    flat_store_dwordx2 v[44:45], v[58:59]
-; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[62:63]
+; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[60:61]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; CHECK-NEXT:    flat_load_dwordx2 v[0:1], v[46:47] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v1, s67
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[66:67], s[66:67] op_sel:[0,1]
+; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[62:63]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s68
 ; CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT:    flat_store_dwordx2 v[56:57], a[32:33]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[56:57], v[60:61]
 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v58, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.1: ; %LeafBlock5
+; CHECK-NEXT:  ; %bb.1: ; %LeafBlock3
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v42
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -89,7 +87,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 1
 ; CHECK-NEXT:  ; %bb.3: ; %Flow
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:  .LBB0_4: ; %Flow8
+; CHECK-NEXT:  .LBB0_4: ; %Flow6
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
 ; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[42:43], v[42:43] op_sel:[0,1]
 ; CHECK-NEXT:    v_pk_mov_b32 v[2:3], v[44:45], v[44:45] op_sel:[0,1]
@@ -102,7 +100,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; CHECK-NEXT:  ; %bb.6: ; %sw.bb.i.i.i.i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:  ; %bb.7: ; %Flow7
+; CHECK-NEXT:  ; %bb.7: ; %Flow5
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:  .LBB0_8: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 06f28f05c20be..6248132455e58 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1204,35 +1204,35 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX1032-LABEL: test_div_fmas_f64:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x44
+; GFX1032-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s12
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s13
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s14
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s15
-; GFX1032-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s10
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s11
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s12
+; GFX1032-NEXT:    v_mov_b32_e32 v3, s13
+; GFX1032-NEXT:    s_bitcmp1_b32 s14, 0
 ; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1032-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX1032-NEXT:    v_div_fmas_f64 v[0:1], s[8:9], v[0:1], v[2:3]
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_div_fmas_f64:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX1064-NEXT:    s_load_dword s0, s[4:5], 0x44
+; GFX1064-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s13
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s14
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s15
-; GFX1064-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s11
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s13
+; GFX1064-NEXT:    s_bitcmp1_b32 s14, 0
 ; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX1064-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
+; GFX1064-NEXT:    v_div_fmas_f64 v[0:1], s[8:9], v[0:1], v[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll
index 84a38fd30963b..57520e226aa48 100644
--- a/llvm/test/CodeGen/NVPTX/dag-cse.ll
+++ b/llvm/test/CodeGen/NVPTX/dag-cse.ll
@@ -9,11 +9,9 @@
 ; Verify that loads with different memory types are not subject to CSE
 ; once they are promoted to the same type.
 ;
-; CHECK: ld.global.v2.b8  {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
-; CHECK: st.global.v2.b8  [b], {%[[B1]], %[[B2]]};
-;
 ; CHECK: ld.global.b32 %[[C:r[0-9]+]], [a];
-; CHECK: st.global.b32 [c], %[[C]];
+; CHECK: st.global.v2.b8  [b],
+; CHECK: st.global.b32 [c],
 
 define void @test1() #0 {
   %1 = load <2 x i8>, ptr addrspace(1) @a, align 8
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 61ff80632c789..f1a78d11fd0f8 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -142,7 +142,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
-; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
@@ -152,8 +152,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
 ; CHECK-PTX-NEXT:    add.s64 %rd3, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd4, %rd3, -8;
-; CHECK-PTX-NEXT:    ld.b32 %r2, [%rd4];
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4+4];
+; CHECK-PTX-NEXT:    ld.v2.b32 {%r2, %r3}, [%rd4];
 ; CHECK-PTX-NEXT:    ld.b8 %rs1, [%rd4+7];
 ; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs1;
 ; CHECK-PTX-NEXT:    ld.b8 %rs2, [%rd4+6];
@@ -162,8 +161,9 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
 ; CHECK-PTX-NEXT:    ld.b64 %rd5, [%rd4+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
-; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd6, %r5;
+; CHECK-PTX-NEXT:    cvt.s32.s8 %r5, %r3;
+; CHECK-PTX-NEXT:    add.s32 %r6, %r4, %r5;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd6, %r6;
 ; CHECK-PTX-NEXT:    add.s64 %rd7, %rd6, %rd5;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %rd7;
 ; CHECK-PTX-NEXT:    ret;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index 70c46f035636a..4e782dff0f972 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -127,7 +127,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(
 
 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32(
-; CHECK-NEXT:    store <2 x i32> <i32 1082130432, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, ptr addrspace(1) %out, align 4
 ; CHECK-NEXT:    ret void
 ;
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
@@ -187,7 +187,7 @@ define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %
 
 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32(
-; CHECK-NEXT:    store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 0x36D6000000000000, float 2.000000e+00, float 0x36E1000000000000>, ptr addrspace(1) %out, align 4
 ; CHECK-NEXT:    ret void
 ;
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
index 5c331a9c9fe39..acd9565f0daf5 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -5,21 +5,46 @@ define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %
 ; CHECK-LABEL: define void @merge_i32_v2i16_f32_v4i8(
 ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
-; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
-; CHECK-NEXT:    [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[LOAD11:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[LOAD11]] to i32
+; CHECK-NEXT:    [[LOAD22:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i8> [[LOAD22]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <2 x i16>
+; CHECK-NEXT:    [[LOAD33:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i8> [[LOAD33]] to float
+; CHECK-NEXT:    [[LOAD44:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
-; CHECK-NEXT:    store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
-; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2]] to <2 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[TMP12]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP14]], i32 3
+; CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
-; CHECK-NEXT:    store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
-; CHECK-NEXT:    [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
-; CHECK-NEXT:    store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float [[TMP6]] to <4 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i8> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i8> poison, i8 [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i8> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i8> [[TMP16]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP21]], i32 2
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i8> [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <8 x i8> [[TMP22]], i8 [[TMP23]], i32 3
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i8> [[LOAD44]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i8> [[TMP24]], i8 [[TMP25]], i32 4
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i8> [[LOAD44]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[TMP27]], i32 5
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i8> [[LOAD44]], i32 2
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[TMP29]], i32 6
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i8> [[LOAD44]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[TMP31]], i32 7
+; CHECK-NEXT:    store <8 x i8> [[TMP32]], ptr addrspace(2) [[STORE_GEP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
@@ -45,13 +70,21 @@ define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2
 ; CHECK-LABEL: define void @merge_f32_v2f16_type(
 ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
-; CHECK-NEXT:    [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[LOAD11:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[LOAD11]] to float
+; CHECK-NEXT:    [[LOAD22:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
-; CHECK-NEXT:    store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
-; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to <2 x half>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x half> poison, half [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x half> [[TMP5]], half [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x half> [[LOAD22]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x half> [[TMP7]], half [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x half> [[LOAD22]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x half> [[TMP9]], half [[TMP10]], i32 3
+; CHECK-NEXT:    store <4 x half> [[TMP11]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index d6b51039d5b44..a963604f1614d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -130,14 +130,14 @@ entry:
   ret void
 }
 
-; Ideally this would be merged
 define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
 ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
-; CHECK-NEXT:    [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    [[LD_01:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[LD_01]] to i32
+; CHECK-NEXT:    [[LD_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
index 2076a9dd962fe..f78e055376a78 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
@@ -229,8 +229,7 @@ entry:
 }
 
 ; CHECK-LABEL: @no_merge_load_i64_ptr32(
-; CHECK: load i64,
-; CHECK: load ptr addrspace(3),
+; CHECK: load <3 x i32>
 define amdgpu_kernel void @no_merge_load_i64_ptr32(ptr addrspace(1) nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
index 20e1fa8633b07..d3bba166bd02c 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -23,8 +23,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define float @foo(ptr %p) {
 entry:
 ; CHECK-LABEL: foo
-; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
-; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+; CHECK: load <2 x float>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x float> {{.*}}, !tbaa [[TAG_char]]
   %0 = load float, ptr %p, align 4, !tbaa !2
   %sub = fadd float %0, -1.000000e+00
   store float %sub, ptr %p, align 4, !tbaa !2
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/mismatched-gep-stride.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/mismatched-gep-stride.ll
index b037b329c9607..620ba9001aa31 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/mismatched-gep-stride.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/mismatched-gep-stride.ll
@@ -7,13 +7,11 @@ define void @mismatched_stride(ptr %p, ptr %idx_ptr, ptr %out) {
 ; CHECK-LABEL: define void @mismatched_stride(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[IDX_PTR:%.*]], ptr [[OUT:%.*]]) {
 ; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[IDX_PTR]], align 4
-; CHECK-NEXT:    [[Y:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[SX:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[SY:%.*]] = sext i32 [[Y]] to i64
 ; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[P]], i64 [[SX]]
-; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[P]], i64 [[SY]]
-; CHECK-NEXT:    [[VA:%.*]] = load i8, ptr [[A]], align 2
-; CHECK-NEXT:    [[VB:%.*]] = load i8, ptr [[B]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[A]], align 2
+; CHECK-NEXT:    [[VA:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[VB:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[SUM:%.*]] = add i8 [[VA]], [[VB]]
 ; CHECK-NEXT:    store i8 [[SUM]], ptr [[OUT]], align 1
 ; CHECK-NEXT:    ret void
@@ -35,13 +33,11 @@ define void @mismatched_stride(ptr %p, ptr %idx_ptr, ptr %out) {
 define i32 @mismatched_stride_vec(ptr %p, i64 %i, i32 %x) {
 ; CHECK-LABEL: define i32 @mismatched_stride_vec(
 ; CHECK-SAME: ptr [[P:%.*]], i64 [[I:%.*]], i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[Y:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[SX:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[SY:%.*]] = sext i32 [[Y]] to i64
 ; CHECK-NEXT:    [[A:%.*]] = getelementptr [4 x i32], ptr [[P]], i64 [[I]], i64 [[SX]]
-; CHECK-NEXT:    [[B:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[I]], i64 [[SY]]
-; CHECK-NEXT:    [[VA1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    [[VB2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 4
+; CHECK-NEXT:    [[VA1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[VB2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[SUM:%.*]] = add i32 [[VA1]], [[VB2]]
 ; CHECK-NEXT:    ret i32 [[SUM]]
 ;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
index 6b9229e8dfd6d..0881729a034e5 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
@@ -15,11 +15,11 @@ define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
index cf575cef2d1ba..98cab14a7beaa 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
@@ -15,16 +15,16 @@ define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -63,16 +63,16 @@ define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -116,15 +116,15 @@ define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -174,7 +174,7 @@ define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -230,7 +230,7 @@ define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, ptr %src, pt
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -288,15 +288,15 @@ define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -354,7 +354,7 @@ define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -411,7 +411,7 @@ define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, ptr %src, ptr
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
-; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]], align 2
 ; CHECK-NEXT:    br label [[BB_SKIP]]
 ; CHECK:       bb.skip:
 ; CHECK-NEXT:    ret void
@@ -461,7 +461,7 @@ define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, ptr %src, ptr
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
-; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]], align 2
 ; CHECK-NEXT:    br label [[BB_SKIP]]
 ; CHECK:       bb.skip:
 ; CHECK-NEXT:    ret void
@@ -509,7 +509,7 @@ define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
-; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -547,7 +547,7 @@ define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
-; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]], align 2
 ; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
 ; CHECK-NEXT:    ret void
@@ -602,7 +602,7 @@ define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:



More information about the llvm-commits mailing list