[llvm] [LSV] Merge contiguous chains across scalar types (PR #154069)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 22 12:11:55 PDT 2025


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/154069

From 97222ffbb2a196fb23625edc8dcc88caa437bfb4 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Thu, 21 Aug 2025 16:29:32 -0500
Subject: [PATCH] [LSV] Merge contiguous chains across scalar types

This change enables the Load/Store Vectorizer to
merge and vectorize contiguous chains even when
their scalar element types differ, as long as
their total bit widths match. To do so, we rebase
offsets between chains, normalize value types to
a common integer type, and insert the necessary
casts around loads and stores. This uncovers more
vectorization opportunities and accounts for the
expected codegen updates across the AMDGPU tests.
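
As a rough illustration, consider this
hypothetical IR (not taken from the test suite;
names and the exact output shape are for
exposition only):

  ; Two contiguous 32-bit accesses with different
  ; scalar types off the same base pointer.
  define float @mixed(ptr addrspace(1) %p) {
    %q = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
    %a = load float, ptr addrspace(1) %p, align 8
    %b = load i32, ptr addrspace(1) %q, align 4
    %c = bitcast i32 %b to float
    %r = fadd float %a, %c
    ret float %r
  }

  ; Roughly what the LSV can now emit, assuming the
  ; alignment and target checks pass: a single
  ; <2 x i32> load, with a bitcast rebuilding the
  ; float value for its users.
  define float @mixed.vectorized(ptr addrspace(1) %p) {
    %v   = load <2 x i32>, ptr addrspace(1) %p, align 8
    %a.i = extractelement <2 x i32> %v, i32 0
    %a   = bitcast i32 %a.i to float
    %b   = extractelement <2 x i32> %v, i32 1
    %c   = bitcast i32 %b to float
    %r   = fadd float %a, %c
    ret float %r
  }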

Key changes:
- Chain merging
  - Build contiguous subchains and then merge
  adjacent ones when:
    - They refer to the same underlying pointer
    object and address space.
    - They are either all loads or all stores.
    - A constant leader-to-leader delta exists.
    - Rebasing one chain into the other's
    coordinate space produces no overlapping
    accesses.
    - All elements have equal total bit width.
  - Rebase the second chain by the computed delta
  and append it to the first.

- Type normalization and casting
  - Normalize merged chains to a common integer
  type sized to the total bits.
  - For loads: create a new load of the normalized
  type, copy metadata, and cast back to the
  original type for uses if needed.
  - For stores: bitcast the value to the
  normalized type and store that (see the
  store-side IR sketch after this list).
  - Insert zext/trunc for integer size changes;
  use bit-or-pointer casts when sizes match.

- Cleanups
  - Erase replaced instructions and DCE pointer
  operands when safe.
  - New helpers: computeLeaderDelta,
  chainsOverlapAfterRebase, rebaseChain,
    normalizeChainToType, and
    allElemsMatchTotalBits.
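
The store side, referenced above, looks similar
in this hypothetical sketch (again illustrative
only; the actual vectorized shape may differ):

  ; Contiguous stores of different 32-bit scalar
  ; types to the same underlying object. The two
  ; one-element chains are rebased into a common
  ; coordinate space and merged before
  ; normalization.
  define void @mixed.store(ptr addrspace(1) %p, float %a, i32 %b) {
    %q = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
    store float %a, ptr addrspace(1) %p, align 8
    store i32 %b, ptr addrspace(1) %q, align 4
    ret void
  }

  ; Roughly after merging and normalization: the
  ; float value is bitcast to the common i32 type
  ; so one <2 x i32> store can be emitted.
  define void @mixed.store.vectorized(ptr addrspace(1) %p, float %a, i32 %b) {
    %a.i = bitcast float %a to i32
    %v0  = insertelement <2 x i32> poison, i32 %a.i, i32 0
    %v   = insertelement <2 x i32> %v0, i32 %b, i32 1
    store <2 x i32> %v, ptr addrspace(1) %p, align 8
    ret void
  }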

Impact:
- Increases vectorization opportunities across
mixed-type but size-compatible access chains.
- A large set of expected AMDGPU codegen diffs
due to additional or changed vectorization.
---
 llvm/include/llvm/Transforms/Utils/Local.h    |    4 +
 llvm/lib/Transforms/Utils/Local.cpp           |   45 +
 .../Vectorize/LoadStoreVectorizer.cpp         |  276 ++-
 .../AMDGPU/32-bit-local-address-space.ll      |   18 +-
 .../AMDGPU/GlobalISel/amdgpu-irtranslator.ll  |    2 +
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  |  516 +++---
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   85 +-
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |   20 +-
 llvm/test/CodeGen/AMDGPU/add_i1.ll            |  153 +-
 llvm/test/CodeGen/AMDGPU/add_i64.ll           |    9 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  242 +--
 llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll       |  448 ++++-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 1481 +++++++++--------
 llvm/test/CodeGen/AMDGPU/and.ll               |  703 +++++---
 .../CodeGen/AMDGPU/atomic_cmp_swap_local.ll   |   52 +-
 .../AMDGPU/av-split-dead-valno-crash.ll       |   18 +-
 .../AMDGPU/branch-relaxation-gfx1250.ll       |   12 +-
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll |   47 +-
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |   14 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i16.ll  |   36 +-
 llvm/test/CodeGen/AMDGPU/fabs.bf16.ll         |   66 +-
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |   28 +-
 llvm/test/CodeGen/AMDGPU/fabs.ll              |   14 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |  104 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  333 ++--
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      | 1308 +++++++--------
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |   10 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |   54 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll    |  108 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |   14 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll         |   14 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |   14 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |  540 +++---
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  192 +--
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  195 +--
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    | 1351 +++++++--------
 llvm/test/CodeGen/AMDGPU/half.ll              |  174 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |   54 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |   22 +-
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       |   24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll    |   20 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  240 ++-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |   86 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |   86 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |   36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |   76 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |   76 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   18 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |   71 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |  216 +--
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |   82 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   68 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   52 +-
 llvm/test/CodeGen/AMDGPU/s_addk_i32.ll        |    4 +-
 .../AMDGPU/sext-divergence-driven-isel.ll     |   44 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |  500 +++---
 .../AMDGPU/si-triv-disjoint-mem-access.ll     |   28 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     |   64 +-
 llvm/test/CodeGen/AMDGPU/store-to-constant.ll |   14 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  259 +--
 llvm/test/CodeGen/AMDGPU/udivrem.ll           |  126 +-
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |    2 +-
 .../AMDGPU/zext-divergence-driven-isel.ll     |   21 +-
 .../AMDGPU/merge-vectors-complex.ll           |   39 +-
 .../AMDGPU/merge-vectors.ll                   |   24 +-
 66 files changed, 5666 insertions(+), 5412 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 3f5f4278a2766..7b94001801c37 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -433,6 +433,10 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
 /// replacement for the source instruction).
 LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
 
+/// Copy the metadata from the source instruction to the destination (the
+/// replacement for the source instruction).
+LLVM_ABI void copyMetadataForStore(StoreInst &Dest, const StoreInst &Source);
+
 /// Patch the replacement so that it is not more restrictive than the value
 /// being replaced. It assumes that the replacement does not get moved from
 /// its original position.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ac344904f90f0..671ebc33fb1e6 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3140,6 +3140,51 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
   }
 }
 
+void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  Source.getAllMetadata(MD);
+  MDBuilder MDB(Dest.getContext());
+  Type *NewType = Dest.getType();
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_access_group:
+    case LLVMContext::MD_noundef:
+    case LLVMContext::MD_noalias_addrspace:
+    case LLVMContext::MD_mem_parallel_loop_access:
+      Dest.setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_tbaa: {
+      MDNode *NewTyNode =
+          MDB.createTBAAScalarTypeNode(NewType->getStructName(), N);
+      Dest.setMetadata(LLVMContext::MD_tbaa, NewTyNode);
+      break;
+    }
+    case LLVMContext::MD_nonnull:
+      break;
+
+    case LLVMContext::MD_align:
+    case LLVMContext::MD_dereferenceable:
+    case LLVMContext::MD_dereferenceable_or_null:
+      // These only directly apply if the new type is also a pointer.
+      if (NewType->isPointerTy())
+        Dest.setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_range:
+      break;
+    }
+  }
+}
+
 void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
   auto *ReplInst = dyn_cast<Instruction>(Repl);
   if (!ReplInst)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..bb69af649dc9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -112,6 +112,7 @@
 #include <optional>
 #include <tuple>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -268,11 +269,6 @@ class Vectorizer {
   /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
   bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
 
-  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
-  /// in the same BB with the same value for getUnderlyingObject() etc.
-  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                             ArrayRef<Instruction *> EqClass);
-
   /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
   /// where all instructions access a known, constant offset from the first
   /// instruction.
@@ -338,12 +334,22 @@ class Vectorizer {
   EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                 BasicBlock::iterator End);
 
+  /// Inserts a cast instruction to convert Val to DstTy.
+  Value *insertCast(Value *Val, Type *DstTy);
+
   /// Partitions Instrs into "chains" where every instruction has a known
   /// constant offset from the first instr in the chain.
   ///
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  // Helpers for chain merging.
+  std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2);
+  bool chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+                                const APInt &Delta) const;
+  static void rebaseChain(Chain &C, const APInt &Delta);
+  void normalizeChainToType(Chain &C, Type *CastTy);
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -425,6 +431,20 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
   return Changed ? PA : PreservedAnalyses::all();
 }
 
+static const Value *getUnderlyingObject(const Value *Ptr) {
+  const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The selects themselves are distinct instructions even if they share
+    // the same condition and evaluate to consecutive pointers for the true
+    // and false values of the condition. Therefore, using the selects
+    // themselves for grouping instructions would put consecutive accesses
+    // into different lists; they would then never be checked for being
+    // consecutive and would not be vectorized.
+    return Sel->getCondition();
+  }
+  return ObjPtr;
+}
+
 bool Vectorizer::run() {
   bool Changed = false;
   // Break up the BB if there are any instrs which aren't guaranteed to transfer
@@ -468,6 +488,91 @@ bool Vectorizer::run() {
   return Changed;
 }
 
+Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
+  if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) {
+    return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc");
+  }
+
+  // If the types are of different sizes and both are integers, we can use
+  // zext or trunc to cast.
+  if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) {
+    if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) {
+      return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc");
+    }
+    return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc");
+  }
+
+  return nullptr;
+}
+
+std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
+                                                    Instruction *I2) {
+  assert((isa<LoadInst>(I1) || isa<StoreInst>(I1)) &&
+         (isa<LoadInst>(I2) || isa<StoreInst>(I2)) &&
+         "computeLeaderDelta must be called with load or store instructions");
+  Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
+  const Value *Ptr1 = getLoadStorePointerOperand(I1);
+  const Value *Ptr2 = getLoadStorePointerOperand(I2);
+  return getConstantOffset(const_cast<Value *>(Ptr1), const_cast<Value *>(Ptr2),
+                           CtxInst);
+}
+
+bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+                                          const APInt &Delta) const {
+  for (const ChainElem &EB : B) {
+    APInt OffB = EB.OffsetFromLeader + Delta;
+    unsigned SizeB = DL.getTypeStoreSize(getLoadStoreType(EB.Inst));
+    ConstantRange BRange(OffB, OffB + SizeB);
+    for (const ChainElem &EA : A) {
+      APInt OffA = EA.OffsetFromLeader;
+      unsigned SizeA = DL.getTypeStoreSize(getLoadStoreType(EA.Inst));
+      ConstantRange ARange(OffA, OffA + SizeA);
+      if (!ARange.intersectWith(BRange).isEmptySet())
+        return true;
+    }
+  }
+  return false;
+}
+
+void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
+  for (ChainElem &E : C)
+    E.OffsetFromLeader += Delta;
+}
+
+void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) {
+  for (ChainElem &Elem : C) {
+    Instruction *Inst = Elem.Inst;
+    Type *OrigValTy = getLoadStoreType(Inst);
+    if (OrigValTy == CastTy)
+      continue;
+
+    if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+      Builder.SetInsertPoint(LI);
+      LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(),
+                                             LI->getName() + ".mut");
+      copyMetadataForLoad(*NewLoad, *LI);
+      Value *CastBack = insertCast(NewLoad, OrigValTy);
+      if (!CastBack)
+        llvm_unreachable("Failed to insert cast");
+      LI->replaceAllUsesWith(CastBack);
+      ToErase.emplace_back(LI);
+      Elem.Inst = NewLoad;
+    } else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+      Builder.SetInsertPoint(SI);
+      Value *CastVal = insertCast(SI->getValueOperand(), CastTy);
+      if (!CastVal)
+        llvm_unreachable("Failed to insert cast");
+      StoreInst *NewStore =
+          Builder.CreateStore(CastVal, SI->getPointerOperand());
+      NewStore->setAlignment(SI->getAlign());
+      NewStore->setVolatile(SI->isVolatile());
+      copyMetadataForStore(*NewStore, *SI);
+      ToErase.emplace_back(SI);
+      Elem.Inst = NewStore;
+    }
+  }
+}
+
 bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                                BasicBlock::iterator End) {
   LLVM_DEBUG({
@@ -480,49 +585,111 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
   });
 
   bool Changed = false;
+  SmallVector<Chain> ContiguousSubChains;
+
   for (const auto &[EqClassKey, EqClass] :
-       collectEquivalenceClasses(Begin, End))
-    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+       collectEquivalenceClasses(Begin, End)) {
 
-  return Changed;
-}
+    LLVM_DEBUG({
+      dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+             << " keyed on " << EqClassKey << ":\n";
+      for (Instruction *I : EqClass)
+        dbgs() << "  " << *I << "\n";
+    });
 
-bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                                       ArrayRef<Instruction *> EqClass) {
-  bool Changed = false;
+    for (Chain &C : gatherChains(EqClass)) {
 
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
-           << " keyed on " << EqClassKey << ":\n";
-    for (Instruction *I : EqClass)
-      dbgs() << "  " << *I << "\n";
-  });
+      // Split up the chain into increasingly smaller chains, until we can
+      // finally vectorize the chains.
+      //
+      // (Don't be scared by the depth of the loop nest here.  These operations
+      // are all at worst O(n lg n) in the number of instructions, and splitting
+      // chains doesn't change the number of instrs.  So the whole loop nest is
+      // O(n lg n).)
+      for (auto &C : splitChainByMayAliasInstrs(C)) {
+        for (auto &C : splitChainByContiguity(C)) {
+          ContiguousSubChains.emplace_back(C);
+        }
+      }
+    }
+  }
 
-  std::vector<Chain> Chains = gatherChains(EqClass);
-  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
-                    << " nontrivial chains.\n";);
-  for (Chain &C : Chains)
-    Changed |= runOnChain(C);
-  return Changed;
-}
+  // Walk the subchains in reverse so that C2 is merged into the preceding
+  // chain C1 and erasing C2 does not invalidate indices yet to be visited.
+  for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
+    Chain &C1 = ContiguousSubChains[I - 1];
+    Chain &C2 = ContiguousSubChains[I];
 
-bool Vectorizer::runOnChain(Chain &C) {
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
-    dumpChain(C);
-  });
+    // If the scalar types of the chains already match, no cross-type merging
+    // is needed here; skip.
+    if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
+        getLoadStoreType(C2[0].Inst)->getScalarType())
+      continue;
+
+    const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
+    const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
+    unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
+    unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
+    bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
+    bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
+
+    // If the chains are in different address spaces, have distinct underlying
+    // pointer objects, or mix loads and stores, skip.
+    if (C1IsLoad != C2IsLoad || AS1 != AS2 ||
+        ::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr))
+      continue;
+
+    // Compute constant offset between chain leaders; if unknown, skip.
+    std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
+    if (!DeltaOpt)
+      continue;
+
+    // Check that rebasing C2 into C1's coordinate space will not overlap C1.
+    if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
+      continue;
+
+    // Determine the common integer cast type for normalization and ensure total
+    // bitwidth matches across all elements of both chains.
+    Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
+    unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
+    auto AllElemsMatchTotalBits = [&](const Chain &C) {
+      return llvm::all_of(C, [&](const ChainElem &E) {
+        return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+      });
+    };
+    if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
+      continue;
+
+    // Rebase C2's offsets into C1's coordinate space prior to merging.
+    rebaseChain(C2, *DeltaOpt);
+
+    // Merge C2 into C1 by appending all elements of C2 to C1, then erase C2
+    // from ContiguousSubChains.
+    C1.insert(C1.end(), C2.begin(), C2.end());
+    ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
+
+    // Normalize the value operand/result type of each instruction in C1 to
+    // C1CastTy.
+    Type *C1CastTy =
+        Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy));
+    normalizeChainToType(C1, C1CastTy);
+  }
+
+  for (auto &C : ContiguousSubChains) {
+    if (C.size() <= 1)
+      continue;
+    for (auto &AlignedSubChain : splitChainByAlignment(C))
+      Changed |= vectorizeChain(AlignedSubChain);
+  }
+
+  // Erase all instructions scheduled for deletion in this pseudo-BB.
+  for (Instruction *I : ToErase) {
+    auto *PtrOperand = getLoadStorePointerOperand(I);
+    if (I->use_empty())
+      I->eraseFromParent();
+    RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+  }
+  ToErase.clear();
 
-  // Split up the chain into increasingly smaller chains, until we can finally
-  // vectorize the chains.
-  //
-  // (Don't be scared by the depth of the loop nest here.  These operations are
-  // all at worst O(n lg n) in the number of instructions, and splitting chains
-  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
-  bool Changed = false;
-  for (auto &C : splitChainByMayAliasInstrs(C))
-    for (auto &C : splitChainByContiguity(C))
-      for (auto &C : splitChainByAlignment(C))
-        Changed |= vectorizeChain(C);
   return Changed;
 }
 
@@ -583,7 +750,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
         LLVM_DEBUG(
             dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                    << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
-        if (NewChain.size() > 1) {
+        if (!NewChain.empty()) {
           LLVM_DEBUG({
             dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
             dumpChain(NewChain);
@@ -595,7 +762,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
         NewChain = SmallVector<ChainElem, 1>({*ChainIt});
       }
     }
-    if (NewChain.size() > 1) {
+    if (!NewChain.empty()) {
       LLVM_DEBUG({
         dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
         dumpChain(NewChain);
@@ -648,8 +815,6 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
       Ret.push_back({*It});
   }
 
-  // Filter out length-1 chains, these are uninteresting.
-  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
   return Ret;
 }
 
@@ -669,7 +834,7 @@ Type *Vectorizer::getChainElemTy(const Chain &C) {
   if (any_of(C, [](const ChainElem &E) {
         return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
       })) {
-    return Type::getIntNTy(
+    return IntegerType::getIntNTy(
         F.getContext(),
         DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
   }
@@ -1434,20 +1599,6 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                       BasicBlock::iterator End) {
   EquivalenceClassMap Ret;
 
-  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
-    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
-    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
-      // The select's themselves are distinct instructions even if they share
-      // the same condition and evaluate to consecutive pointers for true and
-      // false values of the condition. Therefore using the select's themselves
-      // for grouping instructions would put consecutive accesses into different
-      // lists and they won't be even checked for being consecutive, and won't
-      // be vectorized.
-      return Sel->getCondition();
-    }
-    return ObjPtr;
-  };
-
   for (Instruction &I : make_range(Begin, End)) {
     auto *LI = dyn_cast<LoadInst>(&I);
     auto *SI = dyn_cast<StoreInst>(&I);
@@ -1495,7 +1646,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
         (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
       continue;
 
-    Ret[{GetUnderlyingObject(Ptr), AS,
+    Ret[{::getUnderlyingObject(Ptr), AS,
          DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
          /*IsLoad=*/LI != nullptr}]
         .emplace_back(&I);
@@ -1590,8 +1741,7 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
   Ret.reserve(Chains.size());
   // Iterate over MRU rather than Chains so the order is deterministic.
   for (auto &E : MRU)
-    if (E.second.size() > 1)
-      Ret.emplace_back(std::move(E.second));
+    Ret.emplace_back(std::move(E.second));
   return Ret;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 4b53f66b379a4..e941aa9e71e2f 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -298,26 +298,24 @@ define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val)
 define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
 ; GFX7-LABEL: local_address_gep_store:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_add_i32 s0, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX7-NEXT:    s_add_i32 s0, s0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    ds_write_b32 v1, v0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: local_address_gep_store:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    s_add_i32 s0, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX8-NEXT:    s_add_i32 s0, s0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    ds_write_b32 v1, v0
 ; GFX8-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index fc236147f1238..7dd907e3c143f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -20,3 +20,5 @@ define void @addi32(i32 %arg1, i32 %arg2) {
   store i32 %res, ptr addrspace(1) poison
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 481a2540eacb7..69b49fce490e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -84,50 +84,44 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -203,50 +197,44 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -322,49 +310,43 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -440,49 +422,43 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -558,50 +534,44 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -677,50 +647,44 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -796,49 +760,43 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -914,49 +872,43 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -1032,50 +984,44 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -1151,50 +1097,44 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -1270,49 +1210,43 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -1388,49 +1322,43 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 6846137272ec6..7c82c32488beb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -513,15 +513,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x4
-; GFX7-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0xd
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s2, 1, s6
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX7-NEXT:    s_and_b32 s3, 1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_nop 1
@@ -531,18 +529,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x10
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8-NEXT:    s_nop 2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    s_and_b32 s2, 1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -550,65 +545,57 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x34
-; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x10
-; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10_W32-NEXT:    s_clause 0x1
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX10_W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s6
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x34
-; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x10
-; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10_W64-NEXT:    s_clause 0x1
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX10_W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s6
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x34
-; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x10
-; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11_W32-NEXT:    s_clause 0x1
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11_W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    s_and_b32 s4, 1, s6
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x34
-; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x10
-; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11_W64-NEXT:    s_clause 0x1
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11_W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    s_and_b32 s4, 1, s6
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 0535394d1025c..feca85b76cf12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -684,15 +684,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -760,15 +759,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[6:7], v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -836,15 +834,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[6:7], s[6:7], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -912,15 +909,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], s[6:7], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], s[4:5], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index ca605986da941..7c13a0bd97850 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
 define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
 ; GFX9-LABEL: add_var_var_i1:
@@ -18,39 +18,6 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: add_var_var_i1:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: add_var_var_i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_u8 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in0
   %b = load volatile i1, ptr addrspace(1) %in1
   %add = add i1 %a, %b
@@ -72,35 +39,6 @@ define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: add_var_imm_i1:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    s_xor_b32 s2, vcc_lo, -1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: add_var_imm_i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_xor_b32 s2, vcc_lo, -1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in
   %add = add i1 %a, 1
   store i1 %add, ptr addrspace(1) %out
@@ -145,88 +83,6 @@ define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: add_i1_cf:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
-; GFX10-NEXT:    ; implicit-def: $sgpr4
-; GFX10-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s5, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v0
-; GFX10-NEXT:  .LBB2_2: ; %Flow
-; GFX10-NEXT:    s_andn2_saveexec_b32 s5, s5
-; GFX10-NEXT:    s_cbranch_execz .LBB2_4
-; GFX10-NEXT:  ; %bb.3: ; %if
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_andn2_b32 s2, s4, exec_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
-; GFX10-NEXT:    s_or_b32 s4, s2, s3
-; GFX10-NEXT:  .LBB2_4: ; %endif
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b32 s2, s4, -1
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: add_i1_cf:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_mov_b32 s7, exec_lo
-; GFX11-NEXT:    ; implicit-def: $sgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmpx_lt_u32_e32 15, v0
-; GFX11-NEXT:    s_xor_b32 s7, exec_lo, s7
-; GFX11-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11-NEXT:  ; %bb.1: ; %else
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[4:5] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 1, v0
-; GFX11-NEXT:  .LBB2_2: ; %Flow
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s4, s7
-; GFX11-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-NEXT:  ; %bb.3: ; %if
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 s2, s6, exec_lo
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
-; GFX11-NEXT:    s_or_b32 s6, s2, s3
-; GFX11-NEXT:  .LBB2_4: ; %endif
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_xor_b32 s2, s6, -1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
@@ -248,3 +104,6 @@ endif:
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX10: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index a8560e8498c4d..f68b9b318826c 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -142,12 +142,11 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
 ; SI-LABEL: trunc_i64_add_to_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xd
-; SI-NEXT:    s_load_dword s6, s[4:5], 0x11
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s4, s6, s2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_add_i32 s4, s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3160e38df5e3f..4c5a1d71e41b2 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -513,53 +513,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
 ; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX908-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX908-NEXT:    s_mov_b32 s12, 0
-; GFX908-NEXT:    s_mov_b32 s9, s12
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX908-NEXT:    s_load_dwordx2 s[16:17], s[8:9], 0x0
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
+; GFX908-NEXT:    s_mov_b32 s4, 0
+; GFX908-NEXT:    s_mov_b32 s9, s4
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX908-NEXT:    s_sub_i32 s1, 0, s7
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s0
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX908-NEXT:    s_sub_i32 s3, 0, s1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s2
 ; GFX908-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX908-NEXT:    s_mul_i32 s1, s1, s2
-; GFX908-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX908-NEXT:    s_add_i32 s2, s2, s1
-; GFX908-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX908-NEXT:    s_mul_i32 s2, s1, s7
-; GFX908-NEXT:    s_sub_i32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_sub_i32 s6, s2, s7
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX908-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX908-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX908-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    s_mul_i32 s3, s3, s5
+; GFX908-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX908-NEXT:    s_add_i32 s5, s5, s3
+; GFX908-NEXT:    s_mul_hi_u32 s3, s0, s5
+; GFX908-NEXT:    s_mul_i32 s5, s3, s1
+; GFX908-NEXT:    s_sub_i32 s0, s0, s5
+; GFX908-NEXT:    s_add_i32 s8, s3, 1
+; GFX908-NEXT:    s_sub_i32 s5, s0, s1
+; GFX908-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX908-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX908-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX908-NEXT:    s_add_i32 s5, s3, 1
+; GFX908-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX908-NEXT:    s_cselect_b32 s8, s5, s3
+; GFX908-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s2
-; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX908-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX908-NEXT:    s_or_b32 s14, s14, 28
-; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX908-NEXT:    s_or_b32 s12, s12, 28
+; GFX908-NEXT:    s_lshl_b64 s[14:15], s[8:9], 5
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
 ; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX908-NEXT:    s_mul_i32 s3, s5, s2
-; GFX908-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX908-NEXT:    s_mul_i32 s2, s4, s2
+; GFX908-NEXT:    s_mul_i32 s3, s17, s2
+; GFX908-NEXT:    s_mul_hi_u32 s5, s16, s2
+; GFX908-NEXT:    s_mul_i32 s2, s16, s2
 ; GFX908-NEXT:    s_add_i32 s3, s5, s3
-; GFX908-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX908-NEXT:    s_lshl_b64 s[16:17], s[2:3], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
-; GFX908-NEXT:  .LBB3_1: ; %Flow20
+; GFX908-NEXT:  .LBB3_1: ; %Flow21
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
@@ -572,47 +572,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX908-NEXT:    s_mov_b32 s13, s12
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
+; GFX908-NEXT:    s_mov_b32 s5, s4
 ; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v4, s12
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v6, s12
-; GFX908-NEXT:    v_mov_b32_e32 v8, s12
-; GFX908-NEXT:    v_mov_b32_e32 v5, s13
-; GFX908-NEXT:    v_mov_b32_e32 v7, s13
-; GFX908-NEXT:    v_mov_b32_e32 v9, s13
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX908-NEXT:    v_mov_b32_e32 v7, s5
+; GFX908-NEXT:    v_mov_b32_e32 v9, s5
+; GFX908-NEXT:    v_mov_b32_e32 v5, s5
+; GFX908-NEXT:    v_mov_b32_e32 v6, s4
+; GFX908-NEXT:    v_mov_b32_e32 v8, s4
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT:    s_mov_b64 s[20:21], s[12:13]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s13, v3
-; GFX908-NEXT:    s_add_u32 s9, s9, 1
-; GFX908-NEXT:    s_addc_u32 s13, s13, 0
-; GFX908-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX908-NEXT:    s_mul_i32 s13, s6, s13
-; GFX908-NEXT:    s_mul_i32 s23, s7, s9
-; GFX908-NEXT:    s_add_i32 s13, s22, s13
-; GFX908-NEXT:    s_mul_i32 s9, s6, s9
-; GFX908-NEXT:    s_add_i32 s13, s13, s23
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX908-NEXT:    s_add_u32 s5, s5, 1
+; GFX908-NEXT:    s_addc_u32 s9, s9, 0
+; GFX908-NEXT:    s_mul_hi_u32 s22, s10, s5
+; GFX908-NEXT:    s_mul_i32 s9, s10, s9
+; GFX908-NEXT:    s_mul_i32 s23, s11, s5
+; GFX908-NEXT:    s_add_i32 s9, s22, s9
+; GFX908-NEXT:    s_mul_i32 s5, s10, s5
+; GFX908-NEXT:    s_add_i32 s9, s9, s23
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s20, s20, s4
+; GFX908-NEXT:    s_add_u32 s20, s20, s16
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
-; GFX908-NEXT:    s_addc_u32 s21, s21, s5
+; GFX908-NEXT:    s_addc_u32 s21, s21, s17
 ; GFX908-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    s_add_u32 s22, s20, s9
-; GFX908-NEXT:    s_addc_u32 s23, s21, s13
+; GFX908-NEXT:    s_add_u32 s22, s20, s5
+; GFX908-NEXT:    s_addc_u32 s23, s21, s9
 ; GFX908-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
@@ -660,17 +660,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX908-NEXT:  .LBB3_10: ; %Flow19
+; GFX908-NEXT:  .LBB3_10: ; %Flow20
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  ; %bb.11: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_add_u32 s10, s10, s8
-; GFX908-NEXT:    s_addc_u32 s11, s11, 0
-; GFX908-NEXT:    s_add_u32 s14, s14, s16
-; GFX908-NEXT:    s_addc_u32 s15, s15, s17
+; GFX908-NEXT:    s_add_u32 s6, s6, s8
+; GFX908-NEXT:    s_addc_u32 s7, s7, 0
+; GFX908-NEXT:    s_add_u32 s12, s12, s14
+; GFX908-NEXT:    s_addc_u32 s13, s13, s15
 ; GFX908-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
 ; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
@@ -679,52 +679,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
 ; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX90A-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX90A-NEXT:    s_mov_b32 s12, 0
-; GFX90A-NEXT:    s_mov_b32 s9, s12
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX90A-NEXT:    s_load_dwordx2 s[16:17], s[8:9], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
+; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_mov_b32 s9, s4
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s7
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX90A-NEXT:    s_sub_i32 s3, 0, s1
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; GFX90A-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX90A-NEXT:    s_mul_i32 s1, s1, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX90A-NEXT:    s_add_i32 s2, s2, s1
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s1, s7
-; GFX90A-NEXT:    s_sub_i32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_sub_i32 s6, s2, s7
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX90A-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX90A-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX90A-NEXT:    s_mul_i32 s3, s3, s5
+; GFX90A-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX90A-NEXT:    s_add_i32 s5, s5, s3
+; GFX90A-NEXT:    s_mul_hi_u32 s3, s0, s5
+; GFX90A-NEXT:    s_mul_i32 s5, s3, s1
+; GFX90A-NEXT:    s_sub_i32 s0, s0, s5
+; GFX90A-NEXT:    s_add_i32 s8, s3, 1
+; GFX90A-NEXT:    s_sub_i32 s5, s0, s1
+; GFX90A-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX90A-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX90A-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX90A-NEXT:    s_add_i32 s5, s3, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX90A-NEXT:    s_cselect_b32 s8, s5, s3
+; GFX90A-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX90A-NEXT:    s_or_b32 s14, s14, 28
-; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX90A-NEXT:    s_or_b32 s12, s12, 28
+; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[8:9], 5
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s2, v18
 ; GFX90A-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX90A-NEXT:    s_mul_i32 s3, s5, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s4, s2
+; GFX90A-NEXT:    s_mul_i32 s3, s17, s2
+; GFX90A-NEXT:    s_mul_hi_u32 s5, s16, s2
+; GFX90A-NEXT:    s_mul_i32 s2, s16, s2
 ; GFX90A-NEXT:    s_add_i32 s3, s5, s3
-; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[2:3], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
-; GFX90A-NEXT:  .LBB3_1: ; %Flow20
+; GFX90A-NEXT:  .LBB3_1: ; %Flow21
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
@@ -737,34 +737,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX90A-NEXT:    s_mov_b32 s13, s12
+; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
+; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
+; GFX90A-NEXT:    s_mov_b64 s[20:21], s[12:13]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v4
-; GFX90A-NEXT:    v_readfirstlane_b32 s13, v5
-; GFX90A-NEXT:    s_add_u32 s9, s9, 1
-; GFX90A-NEXT:    s_addc_u32 s13, s13, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX90A-NEXT:    s_mul_i32 s13, s6, s13
-; GFX90A-NEXT:    s_mul_i32 s23, s7, s9
-; GFX90A-NEXT:    s_add_i32 s13, s22, s13
-; GFX90A-NEXT:    s_mul_i32 s9, s6, s9
-; GFX90A-NEXT:    s_add_i32 s13, s13, s23
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX90A-NEXT:    s_add_u32 s5, s5, 1
+; GFX90A-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s22, s10, s5
+; GFX90A-NEXT:    s_mul_i32 s9, s10, s9
+; GFX90A-NEXT:    s_mul_i32 s23, s11, s5
+; GFX90A-NEXT:    s_add_i32 s9, s22, s9
+; GFX90A-NEXT:    s_mul_i32 s5, s10, s5
+; GFX90A-NEXT:    s_add_i32 s9, s9, s23
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s20, s20, s4
-; GFX90A-NEXT:    s_addc_u32 s21, s21, s5
+; GFX90A-NEXT:    s_add_u32 s20, s20, s16
+; GFX90A-NEXT:    s_addc_u32 s21, s21, s17
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
@@ -772,8 +772,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    s_add_u32 s22, s20, s9
-; GFX90A-NEXT:    s_addc_u32 s23, s21, s13
+; GFX90A-NEXT:    s_add_u32 s22, s20, s5
+; GFX90A-NEXT:    s_addc_u32 s23, s21, s9
 ; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
@@ -814,17 +814,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX90A-NEXT:  .LBB3_10: ; %Flow19
+; GFX90A-NEXT:  .LBB3_10: ; %Flow20
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  ; %bb.11: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_add_u32 s10, s10, s8
-; GFX90A-NEXT:    s_addc_u32 s11, s11, 0
-; GFX90A-NEXT:    s_add_u32 s14, s14, s16
-; GFX90A-NEXT:    s_addc_u32 s15, s15, s17
+; GFX90A-NEXT:    s_add_u32 s6, s6, s8
+; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
+; GFX90A-NEXT:    s_add_u32 s12, s12, s14
+; GFX90A-NEXT:    s_addc_u32 s13, s13, s15
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX90A-NEXT:    s_branch .LBB3_1
 ; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f96a6f74e3c0d..1e5ecc725de3b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 0
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index c7385e4324e2c..f1db84fe60b82 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2547,44 +2547,45 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: udiv_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
@@ -2600,6 +2601,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2607,42 +2609,43 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: udiv_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s7, 16
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -2650,7 +2653,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
@@ -2659,8 +2661,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -2762,49 +2763,51 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: urem_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_lshr_b32 s5, s6, 16
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mad_f32 v1, -v1, v2, v4
-; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX6-NEXT:    s_and_b32 s5, s9, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
+; GFX6-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
-; GFX6-NEXT:    s_lshr_b32 s5, s9, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
@@ -2815,10 +2818,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s7, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2830,67 +2833,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: urem_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    s_and_b32 s5, s1, 0xffff
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    s_and_b32 s8, s3, 0xffff
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3000,62 +3003,64 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: sdiv_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s7, s10
+; GFX6-NEXT:    s_sext_i32_i16 s6, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GFX6-NEXT:    s_xor_b32 s6, s7, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_or_b32 s8, s6, 1
+; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s8, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT:    s_ashr_i32 s6, s10, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s6, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    s_sext_i32_i16 s5, s11
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_sext_i32_i16 s6, s5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    s_sext_i32_i16 s4, s11
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s11, 16
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT:    s_ashr_i32 s4, s9, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s11, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
@@ -3080,13 +3085,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: sdiv_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s5
 ; GFX9-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -3098,61 +3103,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
-; GFX9-NEXT:    s_sext_i32_i16 s0, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v4
+; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    v_add_u32_e32 v1, s2, v5
 ; GFX9-NEXT:    s_ashr_i32 s2, s3, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
-; GFX9-NEXT:    s_ashr_i32 s0, s1, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s4, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v6
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3270,53 +3275,55 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: srem_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s0, s8
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX6-NEXT:    s_sext_i32_i16 s1, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s1
+; GFX6-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_or_b32 s10, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s11
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
+; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s9
+; GFX6-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v1
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
@@ -3330,30 +3337,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT:    s_ashr_i32 s4, s11, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s9, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s9, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s9, 16
-; GFX6-NEXT:    s_lshr_b32 s7, s11, 16
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s8, s9, 16
+; GFX6-NEXT:    s_or_b32 s9, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s9, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v5
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s8
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s7, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -3362,13 +3369,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: srem_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
+; GFX9-NEXT:    s_sext_i32_i16 s9, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX9-NEXT:    s_xor_b32 s4, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -3380,69 +3387,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s2, s10, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v1, s4, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX9-NEXT:    s_sext_i32_i16 s8, s1
+; GFX9-NEXT:    s_xor_b32 s4, s10, s2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX9-NEXT:    s_or_b32 s6, s4, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v4
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX9-NEXT:    s_sext_i32_i16 s6, s7
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s6
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    s_sext_i32_i16 s8, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s8
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX9-NEXT:    s_xor_b32 s0, s8, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT:    s_xor_b32 s2, s8, s6
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s1, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 16
+; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    s_xor_b32 s0, s2, s3
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s4, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s5, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s3
+; GFX9-NEXT:    s_or_b32 s6, s2, 1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v4|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v6
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3838,46 +3845,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: udiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -3885,48 +3894,47 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v6, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v6, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v6, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4006,52 +4014,54 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: urem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    s_lshr_b32 s6, s10, 16
+; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX6-NEXT:    s_and_b32 s1, s10, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v4
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
-; GFX6-NEXT:    s_and_b32 s6, s9, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v4, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -4059,33 +4069,34 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffff
+; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4094,18 +4105,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s5
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4191,46 +4201,47 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: sdiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s7, s10
+; GFX6-NEXT:    s_sext_i32_i16 s6, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GFX6-NEXT:    s_xor_b32 s6, s7, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_or_b32 s8, s6, 1
+; GFX6-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s8, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s6, v2
+; GFX6-NEXT:    s_ashr_i32 s6, s10, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s6, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    s_sext_i32_i16 s5, s11
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    s_sext_i32_i16 s4, s11
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
@@ -4243,6 +4254,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -4253,13 +4265,13 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: sdiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s5
 ; GFX9-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -4271,44 +4283,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    v_add_u32_e32 v2, s4, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
-; GFX9-NEXT:    s_sext_i32_i16 s0, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v4
+; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s4, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v4|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
+; GFX9-NEXT:    global_store_short v1, v0, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4400,68 +4412,70 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: srem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s0, s8
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX6-NEXT:    s_sext_i32_i16 s1, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s1
+; GFX6-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_or_b32 s10, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s11
+; GFX6-NEXT:    s_sext_i32_i16 s4, s9
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s9
+; GFX6-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    s_or_b32 s7, s4, 1
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s9
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s10, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -4470,12 +4484,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: srem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
+; GFX9-NEXT:    s_sext_i32_i16 s9, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX9-NEXT:    s_xor_b32 s4, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
@@ -4487,51 +4501,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s2, s10, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v1, s4, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    s_xor_b32 s4, s10, s2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX9-NEXT:    s_or_b32 s6, s4, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
-; GFX9-NEXT:    s_sext_i32_i16 s3, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s3
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v3
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX9-NEXT:    s_sext_i32_i16 s5, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT:    s_xor_b32 s0, s3, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s4, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s5, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s6, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
-; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -5665,29 +5679,31 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT:    s_sub_i32 s1, 0, s0
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s11
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s0
+; GFX6-NEXT:    s_mov_b32 s8, s4
+; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX6-NEXT:    s_mul_i32 s1, s1, s0
-; GFX6-NEXT:    s_sub_i32 s1, s8, s1
+; GFX6-NEXT:    s_sub_i32 s1, s6, s1
 ; GFX6-NEXT:    s_sub_i32 s3, s1, s0
 ; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -5702,10 +5718,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX6-NEXT:    s_mul_i32 s0, s0, s2
-; GFX6-NEXT:    s_sub_i32 s0, s9, s0
+; GFX6-NEXT:    s_sub_i32 s0, s7, s0
 ; GFX6-NEXT:    s_sub_i32 s1, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
 ; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
@@ -5716,19 +5732,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -5740,37 +5756,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s5, s4, s7
-; GFX9-NEXT:    s_sub_i32 s0, s0, s5
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX9-NEXT:    s_add_i32 s9, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_sub_i32 s5, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    s_cselect_b32 s0, s5, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s6
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s5, s4, s6
-; GFX9-NEXT:    s_sub_i32 s1, s1, s5
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX9-NEXT:    s_add_i32 s7, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX9-NEXT:    s_sub_i32 s5, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
 ; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -6006,64 +6022,67 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_sub_i32 s0, 0, s6
+; GFX6-NEXT:    s_lshl_b32 s8, 0x1000, s1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s6, s6, s2
-; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_mul_i32 s4, s4, s6
+; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s4, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, 0, s8
+; GFX6-NEXT:    v_mul_lo_u32 v0, s4, v1
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s8
+; GFX6-NEXT:    s_sub_i32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -6075,33 +6094,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s7
-; GFX9-NEXT:    s_sub_i32 s0, s0, s4
-; GFX9-NEXT:    s_sub_i32 s4, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
-; GFX9-NEXT:    s_sub_i32 s4, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s4, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s6
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s6
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s4, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX9-NEXT:    s_sub_i32 s4, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s4, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_sub_i32 s4, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
@@ -6483,136 +6502,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    s_abs_i32 s6, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_sub_i32 s7, 0, s6
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
+; GFX6-NEXT:    s_abs_i32 s8, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    s_lshl_b32 s9, 0x1000, s7
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v0
-; GFX6-NEXT:    s_abs_i32 s7, s0
-; GFX6-NEXT:    s_xor_b32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_abs_i32 s4, s2
+; GFX6-NEXT:    s_xor_b32 s2, s2, s6
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s6
-; GFX6-NEXT:    s_sub_i32 s2, s7, s2
-; GFX6-NEXT:    s_sub_i32 s7, s2, s6
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX6-NEXT:    s_mul_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s5, s4, s8
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_abs_i32 s2, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
+; GFX6-NEXT:    s_abs_i32 s8, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_xor_b32 s3, s1, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_abs_i32 s1, s3
+; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v2
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GFX6-NEXT:    s_xor_b32 s0, s3, s9
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX6-NEXT:    s_mul_i32 s0, s0, s2
-; GFX6-NEXT:    s_sub_i32 s0, s1, s0
-; GFX6-NEXT:    s_sub_i32 s1, s0, s2
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX6-NEXT:    s_mul_i32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, s1, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s3, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX9-NEXT:    s_abs_i32 s6, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s3
-; GFX9-NEXT:    s_abs_i32 s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT:    s_abs_i32 s8, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX9-NEXT:    s_abs_i32 s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s2, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX9-NEXT:    s_sub_i32 s6, 0, s8
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s2, s2, s8
-; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s2
-; GFX9-NEXT:    s_add_i32 s8, s8, s2
-; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s8
-; GFX9-NEXT:    s_mul_i32 s8, s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, s3, s8
-; GFX9-NEXT:    s_add_i32 s9, s2, 1
-; GFX9-NEXT:    s_sub_i32 s8, s3, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s2, s9, s2
-; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
-; GFX9-NEXT:    s_add_i32 s8, s2, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s8, s2
-; GFX9-NEXT:    s_abs_i32 s8, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_xor_b32 s5, s6, s0
-; GFX9-NEXT:    s_sub_i32 s6, 0, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT:    s_mul_i32 s7, s6, s8
+; GFX9-NEXT:    s_sub_i32 s5, s5, s7
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
+; GFX9-NEXT:    s_sub_i32 s7, s5, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s6
+; GFX9-NEXT:    s_abs_i32 s6, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_xor_b32 s5, s5, s2
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
+; GFX9-NEXT:    s_sub_i32 s2, s5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s0, s5, s0
-; GFX9-NEXT:    s_xor_b32 s4, s1, s7
-; GFX9-NEXT:    s_abs_i32 s1, s1
+; GFX9-NEXT:    s_xor_b32 s4, s3, s4
+; GFX9-NEXT:    s_abs_i32 s3, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
-; GFX9-NEXT:    s_add_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s5
-; GFX9-NEXT:    s_mul_i32 s6, s5, s8
-; GFX9-NEXT:    s_sub_i32 s1, s1, s6
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s7
+; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_sub_i32 s7, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s1, s8
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
-; GFX9-NEXT:    s_cselect_b32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
-; GFX9-NEXT:    s_cselect_b32 s1, s6, s5
-; GFX9-NEXT:    s_xor_b32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -6935,122 +6956,125 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    s_abs_i32 s2, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s6
+; GFX6-NEXT:    s_abs_i32 s6, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_sub_i32 s0, 0, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX6-NEXT:    s_abs_i32 s6, s0
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_lshl_b32 s5, 0x1000, s7
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_abs_i32 s4, s2
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 31
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_mul_i32 s7, s7, s2
-; GFX6-NEXT:    s_sub_i32 s6, s6, s7
-; GFX6-NEXT:    s_sub_i32 s7, s6, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_sub_i32 s7, s6, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s6
-; GFX6-NEXT:    s_abs_i32 s3, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s6, 0, s3
-; GFX6-NEXT:    s_abs_i32 s8, s1
-; GFX6-NEXT:    s_xor_b32 s2, s2, s0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s6
+; GFX6-NEXT:    s_sub_i32 s4, s4, s7
+; GFX6-NEXT:    s_sub_i32 s7, s4, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s7, s4, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX6-NEXT:    s_cselect_b32 s8, s7, s4
+; GFX6-NEXT:    s_abs_i32 s9, s5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_sub_i32 s4, 0, s9
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_ashr_i32 s1, s3, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s0, s2, s0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_abs_i32 s0, s3
+; GFX6-NEXT:    s_xor_b32 s3, s8, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_sub_i32 s2, s3, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s2, s8, s2
-; GFX6-NEXT:    s_sub_i32 s8, s2, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
-; GFX6-NEXT:    s_sub_i32 s8, s2, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
-; GFX6-NEXT:    s_xor_b32 s2, s2, s1
-; GFX6-NEXT:    s_sub_i32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX6-NEXT:    s_mul_i32 s3, s3, s9
+; GFX6-NEXT:    s_sub_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s3, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s3, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX6-NEXT:    s_xor_b32 s0, s0, s1
+; GFX6-NEXT:    s_sub_i32 s0, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX9-NEXT:    s_abs_i32 s2, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_sub_i32 s7, 0, s2
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
-; GFX9-NEXT:    s_abs_i32 s0, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
+; GFX9-NEXT:    s_abs_i32 s6, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
 ; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s8
-; GFX9-NEXT:    s_mul_i32 s7, s7, s2
-; GFX9-NEXT:    s_sub_i32 s0, s0, s7
-; GFX9-NEXT:    s_sub_i32 s7, s0, s2
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
-; GFX9-NEXT:    s_sub_i32 s7, s0, s2
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
-; GFX9-NEXT:    s_abs_i32 s7, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_xor_b32 s0, s0, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_i32 s5, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s2, s8
+; GFX9-NEXT:    s_mul_i32 s7, s7, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, s7
+; GFX9-NEXT:    s_sub_i32 s7, s2, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    s_sub_i32 s7, s2, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    s_abs_i32 s4, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s7, 0, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s0, s0, s6
-; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_abs_i32 s1, s1
+; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX9-NEXT:    s_abs_i32 s3, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_sub_i32 s1, s1, s5
-; GFX9-NEXT:    s_sub_i32 s5, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
-; GFX9-NEXT:    s_sub_i32 s5, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
-; GFX9-NEXT:    s_xor_b32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s5, s5, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_xor_b32 s3, s3, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
@@ -10096,6 +10120,9 @@ define i64 @udiv_i64_9divbits(i8 %size) {
 }
 
 define <2 x i64> @srem_zero_zero() {
+; GCN-LABEL: kernel:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
 ; GFX6-LABEL: srem_zero_zero:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index ca1e7c9b06653..7f7fbed7fcdeb 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -949,31 +949,39 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, i64 %a) {
 ; GFX6-LABEL: s_and_32_bit_constant_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0x12d687
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s2, 0x12d687
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_32_bit_constant_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0x12d687
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 0x12d687
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_32_bit_constant_i64:
@@ -992,58 +1000,73 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
+; SI: s_load_dwordx4
+; SI: s_load_dword
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A:s[0-9]+]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B:s[0-9]+]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
 ; GFX6-LABEL: s_and_multi_use_inline_imm_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dword s7, s[4:5], 0x11
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x15
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX6-NEXT:    s_lshl_b32 s7, s7, 1
-; GFX6-NEXT:    s_and_b32 s6, s6, 62
-; GFX6-NEXT:    s_and_b32 s8, s7, 62
-; GFX6-NEXT:    s_add_u32 s6, s6, s4
-; GFX6-NEXT:    s_addc_u32 s7, 0, s5
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    s_add_u32 s4, s8, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s7
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_load_dword s3, s[4:5], 0x11
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x15
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 1
+; GFX6-NEXT:    s_and_b32 s0, s0, 62
+; GFX6-NEXT:    s_and_b32 s2, s1, 62
+; GFX6-NEXT:    s_add_u32 s0, s0, s8
+; GFX6-NEXT:    s_addc_u32 s1, 0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    s_addc_u32 s3, 0, s9
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_multi_use_inline_imm_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dword s7, s[4:5], 0x44
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 1
-; GFX8-NEXT:    s_and_b32 s6, s6, 62
-; GFX8-NEXT:    s_and_b32 s8, s7, 62
-; GFX8-NEXT:    s_add_u32 s6, s6, s4
-; GFX8-NEXT:    s_addc_u32 s7, 0, s5
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    s_add_u32 s4, s8, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    s_addc_u32 s5, 0, s5
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x44
+; GFX8-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x54
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 1
+; GFX8-NEXT:    s_and_b32 s0, s0, 62
+; GFX8-NEXT:    s_and_b32 s2, s1, 62
+; GFX8-NEXT:    s_add_u32 s0, s0, s8
+; GFX8-NEXT:    s_addc_u32 s1, 0, s9
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_add_u32 s2, s2, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_addc_u32 s3, 0, s9
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1625,28 +1648,30 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_64_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 64
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s2, 64
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_64_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 64
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 64
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_64_i64:
@@ -1665,39 +1690,49 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
+; SI: s_load_dwordx4
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A:s[0-9]+]], 1{{$}}
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
+; SI-NOT: and
+; SI: s_add_u32
+; SI-NEXT: s_mov_b32
 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) {
 ; GFX6-LABEL: s_and_inline_imm_64_i64_noshrink:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x11
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX6-NEXT:    s_and_b32 s6, s6, 64
-; GFX6-NEXT:    s_add_u32 s4, s6, s4
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX6-NEXT:    s_and_b32 s0, s0, 64
+; GFX6-NEXT:    s_add_u32 s0, s0, s8
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_addc_u32 s1, 0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_64_i64_noshrink:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX8-NEXT:    s_and_b32 s6, s6, 64
-; GFX8-NEXT:    s_add_u32 s4, s6, s4
-; GFX8-NEXT:    s_addc_u32 s5, 0, s5
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX8-NEXT:    s_and_b32 s0, s0, 64
+; GFX8-NEXT:    s_add_u32 s0, s0, s8
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    s_addc_u32 s1, 0, s9
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_64_i64_noshrink:
@@ -1722,31 +1757,39 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_1_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s2, 1
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_1_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_1_i64:
@@ -1765,31 +1808,41 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_1.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0x3ff00000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0x3ff00000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_1.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0x3ff00000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0x3ff00000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_1.0_i64:
@@ -1809,31 +1862,41 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_neg_1.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0xbff00000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0xbff00000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_neg_1.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0xbff00000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0xbff00000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_neg_1.0_i64:
@@ -1853,31 +1916,41 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_0.5_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0x3fe00000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0x3fe00000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_0.5_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0x3fe00000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0x3fe00000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_0.5_i64:
@@ -1897,31 +1970,41 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_neg_0.5_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0xbfe00000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0xbfe00000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_neg_0.5_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0xbfe00000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0xbfe00000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_neg_0.5_i64:
@@ -1941,31 +2024,39 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_2.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 2.0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 2.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_2.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 2.0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 2.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_2.0_i64:
@@ -1985,31 +2076,39 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_neg_2.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, -2.0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, -2.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_neg_2.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, -2.0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, -2.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_neg_2.0_i64:
@@ -2029,31 +2128,41 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0x40100000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0x40100000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0x40100000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0x40100000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_4.0_i64:
@@ -2073,31 +2182,41 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_neg_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 0xc0100000
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 0xc0100000
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_neg_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 0xc0100000
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 0xc0100000
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_neg_4.0_i64:
@@ -2120,31 +2239,39 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p
 ; Test with the 64-bit integer bitpattern for a 32-bit float in the
 ; low 32-bits, which is not a valid 64-bit inline immediate.
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_f32_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 4.0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s2, 4.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_f32_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 4.0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 4.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_f32_4.0_i64:
@@ -2163,31 +2290,39 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, -4.0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s7
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s2, -4.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, -4.0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, -4.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64:
@@ -2207,32 +2342,38 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou
 }
 
 ; Shift into upper 32-bits
-
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_high_imm_f32_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, 4.0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, 4.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_high_imm_f32_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, 4.0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, 4.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64:
@@ -2252,31 +2393,39 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx4
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; GFX6-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xe
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s6, -4.0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_and_b32 s0, s3, -4.0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x38
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s6, -4.0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_and_b32 s0, s3, -4.0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 9a4040a25419a..abb2dd34e8bf3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -99,15 +99,14 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o
 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
 ; CHECK-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dword s6, s[4:5], 0xb
-; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
-; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CHECK-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 7
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v4, s6
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    v_mov_b32_e32 v4, s4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
 ; CHECK-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -118,16 +117,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
 ;
 ; GFX7-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX7-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -137,53 +135,49 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
 ;
 ; GFX8-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 7
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-NEXT:    ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
index 614b1e38a530f..021464f85c0dd 100644
--- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -5,8 +5,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-LABEL: vgpr_mfma_pass_av_split_crash:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x0
-; CHECK-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x10
+; CHECK-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3e21eeb6
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xa17f65f6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -46,7 +46,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    ; implicit-def: $vgpr30_vgpr31
 ; CHECK-NEXT:    ; implicit-def: $vgpr22_vgpr23
 ; CHECK-NEXT:    s_branch .LBB0_2
-; CHECK-NEXT:  .LBB0_1: ; %Flow9
+; CHECK-NEXT:  .LBB0_1: ; %Flow10
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_17
@@ -61,7 +61,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %.preheader1855.i.i.i3329.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[10:11]
 ; CHECK-NEXT:    flat_load_dwordx2 v[24:25], v[24:25]
 ; CHECK-NEXT:    v_accvgpr_read_b32 v27, a3
 ; CHECK-NEXT:    v_accvgpr_read_b32 v26, a2
@@ -118,7 +118,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_branch .LBB0_5
 ; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[22:23], 0
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[10:11]
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[12:13]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; CHECK-NEXT:    s_branch .LBB0_15
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
@@ -137,7 +137,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v17, v16
 ; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[16:17]
-; CHECK-NEXT:    global_store_dwordx2 v20, v[16:17], s[12:13]
+; CHECK-NEXT:    global_store_dwordx2 v20, v[16:17], s[14:15]
 ; CHECK-NEXT:    s_cselect_b32 s23, s23, 0
 ; CHECK-NEXT:    s_cselect_b32 s22, s22, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
@@ -145,10 +145,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[22:23], 0
-; CHECK-NEXT:  .LBB0_14: ; %Flow6
+; CHECK-NEXT:  .LBB0_14: ; %Flow7
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
-; CHECK-NEXT:  .LBB0_15: ; %Flow6
+; CHECK-NEXT:  .LBB0_15: ; %Flow7
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], -1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
@@ -157,7 +157,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v21, v20
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
-; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[12:13]
+; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[14:15]
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_17: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 243f0ed3a8d0d..13c4dbc8c3aca 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -539,10 +539,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:  ; %bb.12: ; %Flow
 ; GCN-NEXT:    s_add_pc_i64 .LBB10_5-.Lpost_addpc15
 ; GCN-NEXT:  .Lpost_addpc15:
-; GCN-NEXT:  .LBB10_2: ; %Flow5
+; GCN-NEXT:  .LBB10_2: ; %Flow7
 ; GCN-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GCN-NEXT:    s_cbranch_vccz .LBB10_3
-; GCN-NEXT:  ; %bb.14: ; %Flow5
+; GCN-NEXT:  ; %bb.14: ; %Flow7
 ; GCN-NEXT:    s_add_pc_i64 .LBB10_6-.Lpost_addpc16
 ; GCN-NEXT:  .Lpost_addpc16:
 ; GCN-NEXT:  .LBB10_3: ; %bb14
@@ -585,14 +585,12 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:  .LBB10_6:
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GCN-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 5959f76492f3c..9cbfc08f8ecdc 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1295,10 +1295,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cmp_ge_i32 s2, s3
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GCN-NEXT:  .LBB10_4: ; %Flow5
+; GCN-NEXT:  .LBB10_4: ; %Flow7
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_vccz .LBB10_5
-; GCN-NEXT:  ; %bb.10: ; %Flow5
+; GCN-NEXT:  ; %bb.10: ; %Flow7
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
 ; GCN-NEXT:  .Lpost_getpc13:
 ; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295
@@ -1316,15 +1316,15 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:  .LBB10_6:
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: long_branch_hang:
@@ -1363,7 +1363,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:  .LBB10_4: ; %Flow5
+; GFX11-NEXT:  .LBB10_4: ; %Flow7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[8:9]
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB10_6
@@ -1381,15 +1381,14 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:  .LBB10_6:
 ; GFX11-NEXT:    ; implicit-def: $vgpr0
 ; GFX11-NEXT:  .LBB10_7: ; %bb19
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1424,10 +1423,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_2: ; %Flow5
+; GFX12-NEXT:  .LBB10_2: ; %Flow7
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX12-NEXT:    s_cbranch_vccz .LBB10_3
-; GFX12-NEXT:  ; %bb.12: ; %Flow5
+; GFX12-NEXT:  ; %bb.12: ; %Flow7
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
 ; GFX12-NEXT:  .Lpost_getpc14:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1490,14 +1489,12 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:  .LBB10_6:
 ; GFX12-NEXT:    ; implicit-def: $vgpr0
 ; GFX12-NEXT:  .LBB10_7: ; %bb19
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 763f436997c21..2a9ce083dc0e3 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 625ac12b99839..ca0bcd7e196f2 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -186,30 +186,32 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %ou
 ;
 ; GCN-LABEL: dynamic_extract_vector_elt_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x15
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x15
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s2, 4
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s2
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_lshl_b32 s4, s6, 4
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX89-LABEL: dynamic_extract_vector_elt_v3i16:
 ; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX89-NEXT:    s_load_dword s8, s[4:5], 0x54
-; GFX89-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_lshl_b32 s4, s8, 4
-; GFX89-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
-; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_lshl_b32 s0, s8, 4
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index d8f81db70e309..e4e48ffccad54 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -218,21 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s4, s3, 0xffff0000
-; CI-NEXT:    s_lshl_b32 s3, s3, 16
-; CI-NEXT:    s_and_b32 s5, s2, 0xffff0000
-; CI-NEXT:    v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT:    v_mul_f32_e64 v1, 1.0, |s3|
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s5|
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT:    s_lshl_b32 s2, s2, 16
-; CI-NEXT:    v_alignbit_b32 v1, v0, v1, 16
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fabs_v4bf16:
@@ -242,23 +234,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s3, 0x7fff
-; VI-NEXT:    s_lshr_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s5, s2, 0x7fff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0x7fff
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
-; VI-NEXT:    s_and_b32 s4, 0xffff, s4
-; VI-NEXT:    s_and_b32 s5, 0xffff, s5
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s3, s4, s3
-; VI-NEXT:    s_or_b32 s2, s5, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_fabs_v4bf16:
@@ -266,14 +248,8 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s3, 0x7fff
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NEXT:    s_and_b32 s5, s2, 0x7fff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
+; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -283,14 +259,8 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s4, s3, 0x7fff
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT:    s_and_b32 s5, s5, 0x7fff
-; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff
-; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 27cf49aec8229..a77c7ae923d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fabs_v4f16:
@@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c91..baf9b0abf7b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset0_b32 s3, 31
-; VI-NEXT:    s_bitset0_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fffffff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 61f5b73033f5e..a00afb6fcf6ef 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -472,50 +472,52 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
 define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
 ; SI-LABEL: s_test_copysign_v2f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT:    s_brev_b32 s8, -2
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_bfi_b32 v1, s8, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_bfi_b32 v0, s8, v0, v2
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_bfi_b32 v1, s0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_bfi_b32 v0, s0, v0, v2
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_brev_b32 s6, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_bfi_b32 v1, s6, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_bfi_b32 v0, s6, v2, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_bfi_b32 v3, s6, v2, v3
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_bfi_b32 v2, s6, v2, v4
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_v2f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, ptr addrspace(1) %out, align 8
@@ -877,46 +879,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
 define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) {
 ; SI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_brev_b32 s4, -2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s0, -2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b826e6c469d8e..72e3549656327 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -938,16 +938,18 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32:
 ; GFX6-FASTFMA:       ; %bb.0: ; %entry
-; GFX6-FASTFMA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-FASTFMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GFX6-FASTFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-FASTFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, -1
 ; GFX6-FASTFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s4, s0
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s5, s1
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[0:1], s9, s9, v1
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s11
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s3, v0, s3
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -956,13 +958,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v0, -v2, v4, v0
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[0:1], s8, s8, v2
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v1, v0, s11, v1
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v1, v0, s9, v1
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s2, v0, s2
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
@@ -972,20 +974,21 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v0, -v3, v5, v0
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v0, v0, v4, v5
-; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v0, s10, v2
-; GFX6-FASTFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v0, s8, v2
+; GFX6-FASTFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-FASTFMA-NEXT:    s_endpgm
 ;
 ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32:
 ; GFX6-SLOWFMA:       ; %bb.0: ; %entry
-; GFX6-SLOWFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-SLOWFMA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-SLOWFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX6-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v3, 1.0
@@ -995,14 +998,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v5, v2
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[6:7], s4, s4, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v5
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v3, s2
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX6-SLOWFMA-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, -1
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, s2, v3, s2
+; GFX6-SLOWFMA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v2
-; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v1, s3, v0
+; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v1, s5, v0
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v0, -v2, v5, 1.0
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v0, v0, v5, v5
@@ -1012,22 +1014,24 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v0, v2, v0, v5
-; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v0, s2, v4
-; GFX6-SLOWFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v0, s4, v4
+; GFX6-SLOWFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-SLOWFMA-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: s_fdiv_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_mov_b32 s4, s0
+; GFX7-NEXT:    s_mov_b32 s5, s1
+; GFX7-NEXT:    v_div_scale_f32 v2, s[0:1], s9, s9, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s11
-; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s9
+; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s3, v0, s3
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -1036,13 +1040,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX7-NEXT:    v_fma_f32 v0, -v2, v4, v0
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX7-NEXT:    v_div_scale_f32 v3, s[0:1], s8, s8, v2
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX7-NEXT:    v_div_fixup_f32 v1, v0, s11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX7-NEXT:    v_div_fixup_f32 v1, v0, s9, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s2, v0, s2
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v4, v4
@@ -1052,19 +1056,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX7-NEXT:    v_fma_f32 v0, -v3, v5, v0
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v4, v5
-; GFX7-NEXT:    v_div_fixup_f32 v0, v0, s10, v2
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    v_div_fixup_f32 v0, v0, s8, v2
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v5, -v1, v3, 1.0
@@ -1074,13 +1079,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX8-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v5, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX8-NEXT:    v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX8-NEXT:    v_div_scale_f32 v2, s[6:7], s4, s4, v4
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_div_scale_f32 v3, vcc, s2, v3, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v2
-; GFX8-NEXT:    v_div_fixup_f32 v1, v1, s3, v0
+; GFX8-NEXT:    v_div_fixup_f32 v1, v1, s5, v0
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v0, -v2, v5, 1.0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v5, v5
@@ -1090,19 +1094,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_div_fixup_f32 v0, v0, s2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_div_fixup_f32 v0, v0, s4, v4
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s6, s3, s3, s1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s7, s7, s3
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s3, s7, s3
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -1112,12 +1117,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_scale_f32 v2, s6, s2, s2, s0
+; GFX10-NEXT:    v_div_scale_f32 v2, s4, s6, s6, s2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX10-NEXT:    v_div_fixup_f32 v1, v0, s3, s1
-; GFX10-NEXT:    v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX10-NEXT:    v_div_fixup_f32 v1, v0, s7, s3
+; GFX10-NEXT:    v_div_scale_f32 v0, vcc_lo, s2, s6, s2
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
@@ -1128,19 +1132,18 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s2, s0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s6, s2
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s1
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s5, s5, s3
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s3, s5, s3
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -1151,11 +1154,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX11-NEXT:    s_denorm_mode 12
-; GFX11-NEXT:    v_div_scale_f32 v2, null, s2, s2, s0
+; GFX11-NEXT:    v_div_scale_f32 v2, null, s4, s4, s2
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX11-NEXT:    v_div_fixup_f32 v1, v0, s3, s1
-; GFX11-NEXT:    v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX11-NEXT:    v_div_fixup_f32 v1, v0, s5, s3
+; GFX11-NEXT:    v_div_scale_f32 v0, vcc_lo, s2, s4, s2
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -1167,8 +1170,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s2, s0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s4, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32:
@@ -1193,58 +1196,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX67-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v1
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s8
+; GFX67-NEXT:    v_rcp_f32_e32 v1, s9
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX8-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v1
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX8-NEXT:    v_rcp_f32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v3
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX10-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s6
+; GFX10-NEXT:    v_rcp_f32_e32 v1, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_f32_e32 v1, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s6
+; GFX11-NEXT:    v_rcp_f32_e32 v1, s7
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1269,58 +1274,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX67-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s9
+; GFX67-NEXT:    v_rcp_f32_e32 v2, s8
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v2
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s7
+; GFX8-NEXT:    v_rcp_f32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX11-NEXT:    v_rcp_f32_e32 v2, s2
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v2, s6
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1345,58 +1352,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX67-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s9
+; GFX67-NEXT:    v_rcp_f32_e32 v2, s8
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v2
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s7
+; GFX8-NEXT:    v_rcp_f32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX11-NEXT:    v_rcp_f32_e32 v2, s2
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v2, s6
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_arcp_math:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 57be2907da4a0..ec9017c78db4f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -279,13 +279,12 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -298,13 +297,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_add_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -317,13 +315,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_add_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -334,17 +331,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_add_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -361,7 +357,7 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -383,7 +379,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -405,7 +401,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -426,7 +422,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -583,13 +579,12 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -600,13 +595,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_add_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -617,13 +611,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_add_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -634,17 +627,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_add_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -660,7 +652,7 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -680,7 +672,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_add_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -700,7 +692,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_add_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -721,7 +713,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_add_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -887,13 +879,12 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -906,13 +897,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_and_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -925,13 +915,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_and_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -942,17 +931,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_and_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -969,7 +957,7 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -991,7 +979,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1013,7 +1001,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1034,7 +1022,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1191,13 +1179,12 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1208,13 +1195,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_and_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1225,13 +1211,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_and_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1242,17 +1227,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_and_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1268,7 +1252,7 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1288,7 +1272,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_and_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1308,7 +1292,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_and_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1329,7 +1313,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_and_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1495,13 +1479,12 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -1514,13 +1497,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -1533,13 +1515,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1550,17 +1531,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_sub_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1577,7 +1557,7 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1599,7 +1579,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1621,7 +1601,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1642,7 +1622,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1799,13 +1779,12 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1816,13 +1795,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_sub_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1833,13 +1811,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_sub_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1850,17 +1827,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_sub_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1876,7 +1852,7 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1896,7 +1872,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1916,7 +1892,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1937,7 +1913,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2098,13 +2074,12 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -2116,13 +2091,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2134,13 +2108,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2150,17 +2123,16 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_i32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2176,7 +2148,7 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2198,7 +2170,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2220,7 +2192,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2241,7 +2213,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2392,13 +2364,12 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2408,13 +2379,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2424,13 +2394,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2440,17 +2409,16 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_i32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2465,7 +2433,7 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2485,7 +2453,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_max_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2505,7 +2473,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_max_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2526,7 +2494,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_max_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2686,13 +2654,12 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -2704,13 +2671,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2722,13 +2688,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2738,17 +2703,16 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2764,7 +2728,7 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2786,7 +2750,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2808,7 +2772,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2829,7 +2793,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2980,13 +2944,12 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2996,13 +2959,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -3012,13 +2974,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3028,17 +2989,16 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3053,7 +3013,7 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3073,7 +3033,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3093,7 +3053,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3114,7 +3074,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3274,13 +3234,12 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -3292,13 +3251,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3310,13 +3268,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3326,17 +3283,16 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_i32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3352,7 +3308,7 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3374,7 +3330,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3396,7 +3352,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3417,7 +3373,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3568,13 +3524,12 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -3584,13 +3539,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -3600,13 +3554,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3616,17 +3569,16 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_i32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3641,7 +3593,7 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3661,7 +3613,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_min_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3681,7 +3633,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_min_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3702,7 +3654,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_min_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3862,13 +3814,12 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -3880,13 +3831,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3898,13 +3848,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3914,17 +3863,16 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3940,7 +3888,7 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3962,7 +3910,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3984,7 +3932,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4005,7 +3953,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4156,13 +4104,12 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -4172,13 +4119,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_umin_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -4188,13 +4134,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_umin_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4204,17 +4149,16 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4229,7 +4173,7 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4249,7 +4193,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4269,7 +4213,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4290,7 +4234,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4455,13 +4399,12 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -4474,13 +4417,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GCN2-LABEL: atomic_or_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -4493,13 +4435,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GCN3-LABEL: atomic_or_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4510,17 +4451,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_or_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4537,7 +4477,7 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4559,7 +4499,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4581,7 +4521,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4602,7 +4542,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4759,13 +4699,12 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -4776,13 +4715,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GCN2-LABEL: atomic_or_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -4793,13 +4731,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GCN3-LABEL: atomic_or_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4810,17 +4747,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_or_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4836,7 +4772,7 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4856,7 +4792,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ;
 ; GCN2-LABEL: atomic_or_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4876,7 +4812,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ;
 ; GCN3-LABEL: atomic_or_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4897,7 +4833,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ; GFX11-LABEL: atomic_or_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5127,13 +5063,12 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -5146,13 +5081,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -5165,13 +5099,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5182,17 +5115,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5209,7 +5141,7 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5231,7 +5163,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5253,7 +5185,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5274,7 +5206,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5431,13 +5363,12 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -5448,13 +5379,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_xchg_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -5465,13 +5395,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_xchg_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5482,17 +5411,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5508,7 +5436,7 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5528,7 +5456,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5548,7 +5476,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5569,7 +5497,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5739,18 +5667,17 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5760,18 +5687,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5781,16 +5707,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1] offset:16
@@ -5800,17 +5725,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5828,7 +5752,7 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
@@ -5852,7 +5776,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5876,7 +5800,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5899,7 +5823,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
@@ -6060,16 +5984,15 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6079,16 +6002,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6098,16 +6020,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6117,17 +6038,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6144,7 +6064,7 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
@@ -6166,7 +6086,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -6188,7 +6108,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -6211,7 +6131,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
@@ -6379,13 +6299,12 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -6398,13 +6317,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -6417,13 +6335,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6434,17 +6351,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_xor_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -6461,7 +6377,7 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6483,7 +6399,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6505,7 +6421,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6526,7 +6442,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6683,13 +6599,12 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -6700,13 +6615,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_xor_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -6717,13 +6631,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_xor_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6734,17 +6647,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_xor_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -6760,7 +6672,7 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6780,7 +6692,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6800,7 +6712,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6821,7 +6733,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9340,13 +9252,12 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -9359,13 +9270,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -9378,13 +9288,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9395,17 +9304,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_inc_i32_incr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9422,7 +9330,7 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9444,7 +9352,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9466,7 +9374,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9487,7 +9395,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9644,13 +9552,12 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_incr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -9661,13 +9568,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_inc_i32_incr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -9678,13 +9584,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_inc_i32_incr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9695,17 +9600,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_inc_i32_incr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9721,7 +9625,7 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9741,7 +9645,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9761,7 +9665,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9782,7 +9686,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_inc_i32_ret_incr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10080,13 +9984,12 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -10099,13 +10002,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -10118,13 +10020,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10135,17 +10036,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_dec_i32_decr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -10162,7 +10062,7 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10184,7 +10084,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10206,7 +10106,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10227,7 +10127,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10384,13 +10284,12 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_decr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -10401,13 +10300,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_dec_i32_decr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -10418,13 +10316,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_dec_i32_decr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10435,17 +10332,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_dec_i32_decr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -10461,7 +10357,7 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10481,7 +10377,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10501,7 +10397,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10522,7 +10418,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_dec_i32_ret_decr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index a025c36f620c7..3e450b785b57b 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -135,11 +135,11 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_rndne_f32_e32 v1, s3
-; VI-NEXT:    v_rndne_f32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_rndne_f32_e32 v3, s3
+; VI-NEXT:    v_rndne_f32_e32 v2, s2
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index e687745469014..195b7e37d0749 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4185,42 +4185,40 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
 define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
 ; SI-LABEL: s_fneg_select_infloop_regression_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
-; SI-NEXT:    v_bfrev_b32_e32 v0, 1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-NEXT:    v_bfrev_b32_e32 v3, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bitcmp1_b32 s6, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_bitcmp1_b32 s4, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; SI-NEXT:    v_cndmask_b32_e64 v2, -v2, v3, s[2:3]
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v3, s3
-; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v3, v2, 0, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
-; VI-NEXT:    v_bfrev_b32_e32 v0, 1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
+; VI-NEXT:    v_bfrev_b32_e32 v3, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s6, 0
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_bitcmp1_b32 s4, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; VI-NEXT:    v_cndmask_b32_e64 v2, -v2, v3, s[2:3]
 ; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v3, v2, 0, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, double 0.0, double %arg
   %i2 = fneg double %i
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 64a9727330cfd..0a2f64b21e7b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -629,29 +629,23 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s4, s2, 16
-; CI-NEXT:    s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT:    s_and_b32 s2, s3, 0xffff0000
-; CI-NEXT:    s_lshl_b32 s5, s3, 16
-; CI-NEXT:    v_mul_f32_e64 v3, 1.0, |s2|
-; CI-NEXT:    v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT:    v_mul_f32_e64 v1, 1.0, |s5|
-; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s1, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff0000
+; CI-NEXT:    v_mul_f32_e64 v2, -1.0, s3
+; CI-NEXT:    s_lshl_b32 s1, s1, 16
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_and_b32 s0, s2, 0x7fff
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
-; CI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mul_f32_e64 v3, -1.0, s1
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff0000
+; CI-NEXT:    v_alignbit_b32 v3, v2, v3, 16
+; CI-NEXT:    v_mul_f32_e64 v2, -1.0, s1
+; CI-NEXT:    s_lshl_b32 s0, s0, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_mul_f32_e64 v4, -1.0, s0
+; CI-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fneg_fabs_v4bf16:
@@ -661,23 +655,25 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_bitset1_b32 s3, 15
-; VI-NEXT:    s_bitset1_b32 s2, 15
-; VI-NEXT:    s_bitset1_b32 s5, 15
-; VI-NEXT:    s_bitset1_b32 s4, 15
-; VI-NEXT:    s_and_b32 s3, 0xffff, s3
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_and_b32 s2, 0xffff, s2
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s3, s3, s5
-; VI-NEXT:    s_or_b32 s2, s2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s2, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s3, 0x7fff7fff
+; VI-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; VI-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; VI-NEXT:    s_xor_b32 s1, s1, 0x8000
+; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
+; VI-NEXT:    s_xor_b32 s0, s0, 0x8000
+; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
+; VI-NEXT:    s_and_b32 s1, 0xffff, s1
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s0, 0xffff, s0
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_or_b32 s1, s1, s3
+; VI-NEXT:    s_or_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fneg_fabs_v4bf16:
@@ -685,14 +681,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    s_bitset1_b32 s3, 15
-; GFX9-NEXT:    s_bitset1_b32 s2, 15
-; GFX9-NEXT:    s_bitset1_b32 s5, 15
-; GFX9-NEXT:    s_bitset1_b32 s4, 15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff7fff
+; GFX9-NEXT:    s_and_b32 s5, s3, 0x7fff7fff
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; GFX9-NEXT:    s_xor_b32 s3, s3, 0x8000
+; GFX9-NEXT:    s_xor_b32 s5, s5, 0x8000
+; GFX9-NEXT:    s_xor_b32 s2, s2, 0x8000
+; GFX9-NEXT:    s_xor_b32 s4, s4, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -702,14 +700,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX11-NEXT:    s_bitset1_b32 s3, 15
-; GFX11-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-NEXT:    s_bitset1_b32 s4, 15
-; GFX11-NEXT:    s_bitset1_b32 s5, 15
-; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-NEXT:    s_and_b32 s4, s2, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s5, s3, 0x7fff7fff
+; GFX11-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; GFX11-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; GFX11-NEXT:    s_xor_b32 s3, s3, 0x8000
+; GFX11-NEXT:    s_xor_b32 s2, s2, 0x8000
+; GFX11-NEXT:    s_xor_b32 s4, s4, 0x8000
+; GFX11-NEXT:    s_xor_b32 s5, s5, 0x8000
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9d9a851a5507e..305f4e56184cc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
-; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    v_mov_b32_e32 v3, s1
-; CIVI-NEXT:    v_mov_b32_e32 v0, s2
-; CIVI-NEXT:    v_mov_b32_e32 v1, s3
-; CIVI-NEXT:    v_mov_b32_e32 v2, s0
-; CIVI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    s_or_b32 s0, s3, 0x80008000
+; CIVI-NEXT:    s_or_b32 s1, s2, 0x80008000
+; CIVI-NEXT:    v_mov_b32_e32 v2, s1
+; CIVI-NEXT:    v_mov_b32_e32 v3, s0
+; CIVI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CIVI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fneg_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 1fa9bfa3cfa3f..b93a598cb52ae 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -215,13 +215,13 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset1_b32 s3, 31
-; VI-NEXT:    s_bitset1_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_or_b32 s0, s3, 0x80000000
+; VI-NEXT:    s_or_b32 s1, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 5674ae328406d..ebf03f739308a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1622,34 +1622,34 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
 define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) {
 ; GFX7-LABEL: multiple_uses_fneg_select_f64:
 ; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x6
 ; GFX7-NEXT:    s_add_i32 s12, s12, s17
 ; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_bitcmp1_b32 s6, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX7-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, -v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: multiple_uses_fneg_select_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s6, s[8:9], 0x10
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bitcmp1_b32 s6, 0
@@ -1671,7 +1671,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x10
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x18
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index c3f4ebe30152b..17225b7c39f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -68,13 +68,13 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT:    s_xor_b32 s1, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fneg_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 0cb2b0b7df3d2..943f0a51a17b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -85,50 +85,46 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -204,50 +200,46 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -323,49 +315,45 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -441,49 +429,45 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -559,50 +543,46 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -678,50 +658,46 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -797,49 +773,45 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -915,49 +887,45 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -1033,50 +1001,46 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -1152,50 +1116,46 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    s_mov_b32 s6, 4
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_mov_b32 s0, 4
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -1271,49 +1231,45 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -1389,49 +1345,45 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v2, s12
+; GFX942-NEXT:    v_mov_b32_e32 v3, s13
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[2:3], v0, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x3c
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX1250-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[8:11], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[14:15]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index ed1ee4527ed89..9385b3c26276e 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -179,22 +179,22 @@ entry:
 define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshl_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; SI-NEXT:    s_not_b32 s3, s5
-; SI-NEXT:    s_lshr_b32 s1, s1, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_not_b32 s1, s4
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; SI-NEXT:    s_lshr_b32 s0, s0, 1
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    s_not_b32 s1, s7
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    v_alignbit_b32 v0, s3, v0, 1
+; SI-NEXT:    s_lshr_b32 s0, s3, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_alignbit_b32 v1, s0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_not_b32 s1, s6
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
+; SI-NEXT:    s_lshr_b32 s0, s2, 1
 ; SI-NEXT:    v_mov_b32_e32 v2, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -202,47 +202,43 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; VI-LABEL: fshl_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_not_b32 s7, s7
-; VI-NEXT:    s_lshr_b32 s3, s1, 1
-; VI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_not_b32 s1, s6
-; VI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; VI-NEXT:    s_lshr_b32 s0, s0, 1
-; VI-NEXT:    v_mov_b32_e32 v2, s1
-; VI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    s_not_b32 s1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 s0, s3, 1
+; VI-NEXT:    v_alignbit_b32 v2, s3, v2, 1
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_alignbit_b32 v3, s0, v2, v3
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_not_b32 s1, s6
+; VI-NEXT:    v_alignbit_b32 v2, s2, v2, 1
+; VI-NEXT:    s_lshr_b32 s0, s2, 1
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_alignbit_b32 v2, s0, v2, v4
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 1
-; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; GFX9-NEXT:    s_not_b32 s1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    s_not_b32 s1, s15
+; GFX9-NEXT:    s_lshr_b32 s0, s11, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s11, v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_not_b32 s1, s8
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    s_not_b32 s1, s14
+; GFX9-NEXT:    v_alignbit_b32 v0, s10, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s10, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32:
@@ -265,40 +261,34 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; GFX10-LABEL: fshl_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v0, s1, s3, 1
-; GFX10-NEXT:    v_alignbit_b32 v3, s0, s2, 1
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_not_b32 s2, s7
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX10-NEXT:    s_not_b32 s3, s6
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, v0, s2
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, v3, s3
+; GFX10-NEXT:    v_alignbit_b32 v0, s11, s13, 1
+; GFX10-NEXT:    v_alignbit_b32 v3, s10, s12, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s11, 1
+; GFX10-NEXT:    s_not_b32 s1, s15
+; GFX10-NEXT:    s_lshr_b32 s2, s10, 1
+; GFX10-NEXT:    s_not_b32 s3, s14
+; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, v3, s3
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, s1, s3, 1
-; GFX11-NEXT:    v_alignbit_b32 v3, s0, s2, 1
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX11-NEXT:    s_not_b32 s2, s7
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX11-NEXT:    s_not_b32 s3, s6
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, v0, s2
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v3, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v0, s3, s5, 1
+; GFX11-NEXT:    v_alignbit_b32 v3, s2, s4, 1
+; GFX11-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX11-NEXT:    s_not_b32 s4, s7
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_not_b32 s5, s6
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, v0, s4
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, v3, s5
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -309,43 +299,45 @@ entry:
 define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshl_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 23
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 25
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, 23
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, 25
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 25
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, 23
+; GFX9-NEXT:    v_alignbit_b32 v0, s2, v3, 25
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32_imm:
@@ -365,25 +357,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX10-LABEL: fshl_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 23
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 25
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s7, 23
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s6, 25
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_v2i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 23
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 25
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s5, 23
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s4, 25
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index b35b55338e5fe..48e1cccf516b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -221,51 +221,47 @@ entry:
 define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshr_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, v3
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, v2
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-NEXT:    v_alignbit_b32 v1, s11, v0, v1
+; GFX9-NEXT:    v_alignbit_b32 v0, s10, v3, v4
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -285,79 +281,64 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; GFX10-LABEL: fshr_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s15
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    v_alignbit_b32 v1, s11, s13, v0
+; GFX10-NEXT:    v_alignbit_b32 v0, s10, s12, v2
 ; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: fshr_v2i32:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT:    s_clause 0x2
-; GFX11-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
-; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0.l
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s2, s4, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fshr_v2i32:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT:    s_clause 0x2
-; GFX11-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
-; GFX11-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s2, s4, v2
+; GFX11-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX12-TRUE16-LABEL: fshr_v2i32:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT:    s_clause 0x2
-; GFX12-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
-; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0.l
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s2, s4, v0.h
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX12-FAKE16-LABEL: fshr_v2i32:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT:    s_clause 0x2
-; GFX12-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
-; GFX12-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s2, s4, v2
+; GFX12-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -368,43 +349,45 @@ entry:
 define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshr_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, 9
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, 7
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 7
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, 9
+; GFX9-NEXT:    v_alignbit_b32 v0, s2, v3, 7
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v2i32_imm:
@@ -424,37 +407,37 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX10-LABEL: fshr_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s7, 9
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s6, 7
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_v2i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s5, 9
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s4, 7
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: fshr_v2i32_imm:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX12-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT:    v_alignbit_b32 v1, s3, s5, 9
+; GFX12-NEXT:    v_alignbit_b32 v0, s2, s4, 7
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index da132d0269e6b..b21392bbe8cfe 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -321,14 +321,13 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -338,13 +337,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_add_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -357,14 +355,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_add_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -373,16 +370,15 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -399,7 +395,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_add_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -420,7 +416,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_add_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -444,7 +440,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -462,7 +458,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -613,30 +609,29 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_add_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -647,14 +642,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_add_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -663,16 +657,15 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_add_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -688,7 +681,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_add_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -709,7 +702,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_add_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -731,7 +724,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_add_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -749,7 +742,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_add_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -901,14 +894,13 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -918,13 +910,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_and_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -937,14 +928,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_and_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -953,16 +943,15 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_and_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -979,7 +968,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_and_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1000,7 +989,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_and_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1024,7 +1013,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1042,7 +1031,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1193,30 +1182,29 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_and_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1227,14 +1215,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_and_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1243,16 +1230,15 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_and_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1268,7 +1254,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_and_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1289,7 +1275,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_and_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1311,7 +1297,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_and_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1329,7 +1315,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_and_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1481,14 +1467,13 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -1498,13 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_sub_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1517,14 +1501,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1533,16 +1516,15 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_sub_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1559,7 +1541,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1580,7 +1562,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1604,7 +1586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1622,7 +1604,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1773,30 +1755,29 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_sub_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1807,14 +1788,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_sub_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1823,16 +1803,15 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_sub_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1848,7 +1827,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_sub_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1869,7 +1848,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_sub_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1891,7 +1870,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1909,7 +1888,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2057,14 +2036,13 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -2072,13 +2050,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_max_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2089,30 +2066,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2128,7 +2103,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2148,7 +2123,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2171,7 +2146,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2188,7 +2163,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2327,28 +2302,27 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2357,30 +2331,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2395,7 +2367,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_max_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2415,7 +2387,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2436,7 +2408,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2453,7 +2425,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_max_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2593,14 +2565,13 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -2608,13 +2579,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umax_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2625,30 +2595,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2664,7 +2632,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2684,7 +2652,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2707,7 +2675,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2724,7 +2692,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2863,28 +2831,27 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2893,30 +2860,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2931,7 +2896,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_umax_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2951,7 +2916,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2972,7 +2937,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2989,7 +2954,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3129,14 +3094,13 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -3144,13 +3108,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_min_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -3161,30 +3124,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3200,7 +3161,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3220,7 +3181,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3243,7 +3204,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3260,7 +3221,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3399,28 +3360,27 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3429,30 +3389,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3467,7 +3425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_min_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3487,7 +3445,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3508,7 +3466,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3525,7 +3483,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_min_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3665,14 +3623,13 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -3680,13 +3637,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umin_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -3697,30 +3653,28 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3736,7 +3690,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3756,7 +3710,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3779,7 +3733,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3796,7 +3750,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3935,28 +3889,27 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umin_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3965,30 +3918,28 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_umin_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -4003,7 +3954,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_umin_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4023,7 +3974,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_umin_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4044,7 +3995,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4061,7 +4012,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4212,14 +4163,13 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -4229,13 +4179,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; VI-LABEL: atomic_or_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4248,14 +4197,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; GFX9-LABEL: atomic_or_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4264,16 +4212,15 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_or_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4290,7 +4237,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ; SI-LABEL: atomic_or_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4311,7 +4258,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; VI-LABEL: atomic_or_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4335,7 +4282,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4353,7 +4300,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX11-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4504,30 +4451,29 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_or_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -4538,14 +4484,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_or_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4554,16 +4499,15 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_or_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4579,7 +4523,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; SI-LABEL: atomic_or_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4600,7 +4544,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; VI-LABEL: atomic_or_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4622,7 +4566,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: atomic_or_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4640,7 +4584,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX11-LABEL: atomic_or_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4849,14 +4793,13 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -4866,13 +4809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_xchg_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4885,14 +4827,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4901,16 +4842,15 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4927,7 +4867,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4948,7 +4888,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4972,7 +4912,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4990,7 +4930,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -5141,30 +5081,29 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_xchg_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -5175,14 +5114,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_xchg_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -5191,16 +5129,15 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5216,7 +5153,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_xchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5237,7 +5174,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_xchg_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5259,7 +5196,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -5277,7 +5214,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -5437,16 +5374,15 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s7, s[4:5], 0xf
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16
@@ -5456,18 +5392,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5477,17 +5412,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5495,17 +5429,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5522,9 +5455,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x11
+; SI-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5532,8 +5465,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; SI-NEXT:    s_lshl_b64 s[8:9], s[8:9], 2
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
 ; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16 glc
@@ -5545,7 +5478,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5571,17 +5504,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5591,7 +5524,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -5752,35 +5685,34 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v3, s5
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5790,17 +5722,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5808,17 +5739,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5834,9 +5764,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x11
+; SI-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5844,8 +5774,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; SI-NEXT:    s_lshl_b64 s[8:9], s[8:9], 2
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
 ; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 glc
@@ -5857,7 +5787,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5881,17 +5811,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5901,7 +5831,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -6055,14 +5985,13 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -6072,13 +6001,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_xor_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -6091,14 +6019,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6107,16 +6034,15 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_xor_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -6133,7 +6059,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6154,7 +6080,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6178,7 +6104,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -6196,7 +6122,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -6347,30 +6273,29 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_xor_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -6381,14 +6306,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_xor_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6397,16 +6321,15 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_xor_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -6422,7 +6345,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_xor_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6443,7 +6366,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_xor_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6465,7 +6388,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -6483,7 +6406,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -8495,14 +8418,13 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_inc_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -8512,13 +8434,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_inc_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,14 +8452,13 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_inc_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_inc v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -8547,16 +8467,15 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_inc_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -8573,7 +8492,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8594,7 +8513,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8618,7 +8537,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -8636,7 +8555,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -8974,14 +8893,13 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_dec_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -8991,13 +8909,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_dec_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -9010,14 +8927,13 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_dec_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_dec v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -9026,16 +8942,15 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_dec_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -9052,7 +8967,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9073,7 +8988,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9097,7 +9012,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -9115,7 +9030,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 8e427a6ef2023..87e57298f5dc6 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_add_u32 s4, s0, 4
-; CIVI-NEXT:    s_addc_u32 s5, s1, 0
-; CIVI-NEXT:    v_mov_b32_e32 v2, s4
-; CIVI-NEXT:    v_mov_b32_e32 v4, s3
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    v_mov_b32_e32 v3, s5
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
-; CIVI-NEXT:    v_mov_b32_e32 v5, s2
-; CIVI-NEXT:    flat_store_short v[2:3], v4
-; CIVI-NEXT:    flat_store_dword v[0:1], v5
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
+; CIVI-NEXT:    s_add_u32 s0, s0, 4
+; CIVI-NEXT:    flat_store_dword v[0:1], v2
+; CIVI-NEXT:    s_addc_u32 s1, s1, 0
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s3
+; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3f16_arg:
@@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v3, s3
 ; CIVI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CIVI-NEXT:    s_endpgm
@@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; GFX11-LABEL: load_v4f16_arg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   store <4 x half> %arg, ptr addrspace(1) %out
@@ -348,37 +348,21 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 }
 
 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
-; CI-LABEL: extload_v3f16_to_v3f32_arg:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s2, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v4, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s0
-; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: extload_v3f16_to_v3f32_arg:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
-; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
-; VI-NEXT:    s_endpgm
+; CIVI-LABEL: extload_v3f16_to_v3f32_arg:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT:    s_add_i32 s12, s12, s17
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_lshr_b32 s4, s2, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; CIVI-NEXT:    v_mov_b32_e32 v3, s0
+; CIVI-NEXT:    v_mov_b32_e32 v4, s1
+; CIVI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
+; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
 ; GFX11:       ; %bb.0:
@@ -386,9 +370,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <3 x half> %arg to <3 x float>
@@ -404,14 +388,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s3, 16
-; CI-NEXT:    s_lshr_b32 s5, s2, 16
+; CI-NEXT:    s_lshr_b32 s4, s2, 16
+; CI-NEXT:    s_lshr_b32 s5, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, s0
+; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; CI-NEXT:    s_endpgm
 ;
@@ -424,12 +408,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -440,10 +424,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <4 x half> %arg to <4 x float>
@@ -724,61 +708,33 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 }
 
 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
-; CI-LABEL: extload_v4f16_to_v4f64_arg:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s3, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; CI-NEXT:    s_lshr_b32 s5, s2, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, s5
-; CI-NEXT:    s_add_u32 s2, s0, 16
-; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; CI-NEXT:    s_addc_u32 s3, s1, 0
-; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; CI-NEXT:    v_mov_b32_e32 v9, s3
-; CI-NEXT:    v_mov_b32_e32 v8, s2
-; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; CI-NEXT:    s_nop 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: extload_v4f16_to_v4f64_arg:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s5
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; VI-NEXT:    s_add_u32 s2, s0, 16
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; VI-NEXT:    v_mov_b32_e32 v9, s3
-; VI-NEXT:    v_mov_b32_e32 v8, s2
-; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; VI-NEXT:    s_endpgm
+; CIVI-LABEL: extload_v4f16_to_v4f64_arg:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT:    s_add_i32 s12, s12, s17
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_lshr_b32 s5, s3, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v0, s3
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; CIVI-NEXT:    s_lshr_b32 s4, s2, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v4, s2
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; CIVI-NEXT:    s_add_u32 s2, s0, 16
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; CIVI-NEXT:    s_addc_u32 s3, s1, 0
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; CIVI-NEXT:    v_mov_b32_e32 v9, s3
+; CIVI-NEXT:    v_mov_b32_e32 v8, s2
+; CIVI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; CIVI-NEXT:    s_nop 0
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index e1b4cad370f96..11826aa0b360d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -290,19 +290,19 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec,
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s5, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s5, 4
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec,
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s5, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s5, 4
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x34
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s4, s4, 3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s1, s3, 0x1010101
+; GCN-NEXT:    s_xor_b32 s0, s2, 0x1010101
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 44bd4090436ef..0251a5af8916b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1573,13 +1573,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s0, 0x50005
+; VI-NEXT:    s_lshl_b32 s0, s8, 4
+; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s9, s8
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; VI-NEXT:    s_xor_b64 s[8:9], s[2:3], s[8:9]
+; VI-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
 ; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
@@ -1680,14 +1680,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0xa
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; SI-NEXT:    s_and_b32 s5, s6, s5
 ; SI-NEXT:    s_xor_b32 s4, s5, s4
@@ -1697,14 +1696,13 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; VI-NEXT:    s_and_b32 s5, s6, s5
 ; VI-NEXT:    s_xor_b32 s4, s5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a2da8876472ab..41b5103b38e50 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s0, 4
-; VI-NEXT:    s_addc_u32 s5, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v5, s2
-; VI-NEXT:    flat_store_short v[2:3], v4
-; VI-NEXT:    flat_store_dword v[0:1], v5
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_add_u32 s0, s0, 4
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v3i16_arg:
@@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 18afb7c677207..b71b6720f479f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -57,19 +57,19 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
 ; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x16
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN-NEXT:    s_load_dword s4, s[4:5], 0xd
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x16
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_bitcmp1_b32 s6, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 078a043b94604..093e205c13305 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -139,16 +139,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; SDAG-NEXT:    v_mov_b32_e32 v6, s14
+; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; SDAG-NEXT:    s_nop 7
@@ -177,16 +183,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s15
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s0
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s1
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; HEURRC-NEXT:    s_nop 7
@@ -196,16 +208,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT:    v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; VGPRRC-NEXT:    s_nop 7
@@ -258,16 +276,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; SDAG-NEXT:    v_mov_b32_e32 v6, s14
+; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    s_nop 7
@@ -296,16 +320,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s15
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s0
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s1
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
@@ -315,16 +345,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT:    v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
@@ -5459,16 +5495,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    v_mov_b32_e32 v6, s14
+; GCN-NEXT:    v_mov_b32_e32 v7, s15
+; GCN-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; GCN-NEXT:    s_nop 7
@@ -5478,16 +5520,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s15
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s0
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s1
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; HEURRC-NEXT:    s_nop 7
@@ -5497,16 +5545,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT:    v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
 ; VGPRRC-NEXT:    s_nop 7
@@ -5559,16 +5613,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    v_mov_b32_e32 v6, s14
+; GCN-NEXT:    v_mov_b32_e32 v7, s15
+; GCN-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    s_nop 7
@@ -5578,16 +5638,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s15
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s0
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s1
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
@@ -5597,16 +5663,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT:    v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 7151fee3cdc96..ae4c88832dbe6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3fb8a000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0x42b17218
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x39a3b295, v6
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x39a3b295, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3fb8a000, v8
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 918b1b26aff2e..f28b5538cc9c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_exp10_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x40549000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0x421a209b
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3a2784bc, v6
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x40549000, v6
-; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc23369f4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3a2784bc, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x40549000, v8
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index e71ea505caea1..d3336f419c310 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -225,26 +225,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_exp2_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_exp_f32_e32 v4, v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, s2, v2
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, s2, v0
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    s_cselect_b32 s3, 0xffffffc0, 0
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v2, s3
-; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT:    s_cselect_b32 s2, 0xffffffc0, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, s2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v4, s0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, s0
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 307fa89003b4b..5412b26eef8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_log_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41b17218
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3f317000, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v7
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT:    v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v5
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v5, v8, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT:    v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5278589968d91..d85e181861a93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_log10_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x411a209b
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3e9a2000, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v7
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT:    v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v5
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v5, v8, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT:    v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 4ca612aa32e84..738bc2f9c6d6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -285,16 +285,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
 ; VI-SDAG-NEXT:    s_cselect_b32 s3, 32, 0
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s2, v0
 ; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
-; VI-SDAG-NEXT:    v_log_f32_e32 v4, v1
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v4, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v2
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v5, v4
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index b8f9571ccc2ee..f6a05291a58af 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1036,14 +1036,14 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; CI-LABEL: mad_i64_i32_uniform:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xb
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s3
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s5
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s4, v2, v[0:1]
 ; CI-NEXT:    s_mov_b32 s4, s0
 ; CI-NEXT:    s_mov_b32 s5, s1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1052,17 +1052,17 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; SI-LABEL: mad_i64_i32_uniform:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mul_i32 s0, s2, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_mul_hi_u32 v1, s8, v0
+; SI-NEXT:    s_mul_i32 s0, s8, s9
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1070,14 +1070,14 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ;
 ; GFX9-LABEL: mad_i64_i32_uniform:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s3
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    s_add_u32 s2, s2, s6
-; GFX9-NEXT:    s_addc_u32 s3, s4, s7
+; GFX9-NEXT:    s_mul_i32 s5, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s7
+; GFX9-NEXT:    s_add_u32 s2, s5, s2
+; GFX9-NEXT:    s_addc_u32 s3, s4, s3
 ; GFX9-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1085,13 +1085,13 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX11-LABEL: mad_i64_i32_uniform:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_i32 s6, s2, s3
-; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
-; GFX11-NEXT:    s_add_u32 s2, s6, s4
-; GFX11-NEXT:    s_addc_u32 s3, s3, s5
+; GFX11-NEXT:    s_mul_i32 s4, s6, s7
+; GFX11-NEXT:    s_mul_hi_u32 s5, s6, s7
+; GFX11-NEXT:    s_add_u32 s2, s4, s2
+; GFX11-NEXT:    s_addc_u32 s3, s5, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -1100,16 +1100,16 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX12-LABEL: mad_i64_i32_uniform:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x2c
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s7, 0
+; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s6, s2
-; GFX12-NEXT:    s_mov_b32 s2, s3
-; GFX12-NEXT:    s_mov_b32 s3, s7
+; GFX12-NEXT:    s_mov_b32 s4, s6
+; GFX12-NEXT:    s_mov_b32 s6, s7
+; GFX12-NEXT:    s_mov_b32 s7, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -1117,18 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ;
 ; GFX1250-LABEL: mad_i64_i32_uniform:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x2c
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-NEXT:    s_mov_b32 s7, 0
+; GFX1250-NEXT:    s_mov_b32 s5, 0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s6, s2
-; GFX1250-NEXT:    s_mov_b32 s2, s3
-; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s4, s6
+; GFX1250-NEXT:    s_mov_b32 s6, s7
+; GFX1250-NEXT:    s_mov_b32 s7, s5
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 311527d5d04cc..6c4e557f8af2d 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1069,117 +1069,117 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ;
 ; CI-LABEL: s_test_imin_sle_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_ashr_i32 s6, s0, 16
-; CI-NEXT:    s_ashr_i32 s7, s1, 16
-; CI-NEXT:    s_sext_i32_i16 s0, s0
-; CI-NEXT:    s_sext_i32_i16 s1, s1
-; CI-NEXT:    s_ashr_i32 s8, s2, 16
-; CI-NEXT:    s_ashr_i32 s9, s3, 16
-; CI-NEXT:    s_sext_i32_i16 s2, s2
-; CI-NEXT:    s_sext_i32_i16 s3, s3
-; CI-NEXT:    s_min_i32 s7, s7, s9
-; CI-NEXT:    s_min_i32 s1, s1, s3
-; CI-NEXT:    s_min_i32 s3, s6, s8
-; CI-NEXT:    s_min_i32 s0, s0, s2
-; CI-NEXT:    s_lshl_b32 s7, s7, 16
-; CI-NEXT:    s_and_b32 s1, s1, 0xffff
-; CI-NEXT:    s_lshl_b32 s3, s3, 16
-; CI-NEXT:    s_and_b32 s0, s0, 0xffff
-; CI-NEXT:    s_or_b32 s1, s1, s7
-; CI-NEXT:    s_or_b32 s0, s0, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_ashr_i32 s0, s2, 16
+; CI-NEXT:    s_ashr_i32 s1, s3, 16
+; CI-NEXT:    s_sext_i32_i16 s2, s2
+; CI-NEXT:    s_sext_i32_i16 s3, s3
+; CI-NEXT:    s_ashr_i32 s6, s4, 16
+; CI-NEXT:    s_ashr_i32 s7, s5, 16
+; CI-NEXT:    s_sext_i32_i16 s4, s4
+; CI-NEXT:    s_sext_i32_i16 s5, s5
+; CI-NEXT:    s_min_i32 s1, s1, s7
+; CI-NEXT:    s_min_i32 s3, s3, s5
+; CI-NEXT:    s_min_i32 s0, s0, s6
+; CI-NEXT:    s_min_i32 s2, s2, s4
+; CI-NEXT:    s_lshl_b32 s1, s1, 16
+; CI-NEXT:    s_and_b32 s3, s3, 0xffff
+; CI-NEXT:    s_lshl_b32 s0, s0, 16
+; CI-NEXT:    s_and_b32 s2, s2, 0xffff
+; CI-NEXT:    s_or_b32 s1, s3, s1
+; CI-NEXT:    s_or_b32 s0, s2, s0
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_imin_sle_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s6, s3, 16
-; VI-NEXT:    s_ashr_i32 s7, s1, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_ashr_i32 s0, s5, 16
+; VI-NEXT:    s_ashr_i32 s1, s3, 16
+; VI-NEXT:    s_min_i32 s0, s1, s0
+; VI-NEXT:    s_sext_i32_i16 s1, s5
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_min_i32 s6, s7, s6
-; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_min_i32 s1, s3, s1
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_ashr_i32 s1, s4, 16
 ; VI-NEXT:    s_ashr_i32 s3, s2, 16
-; VI-NEXT:    s_ashr_i32 s6, s0, 16
+; VI-NEXT:    s_min_i32 s1, s3, s1
+; VI-NEXT:    s_sext_i32_i16 s3, s4
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_min_i32 s3, s6, s3
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_min_i32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s1, s2, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_imin_sle_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_pk_min_i16 v1, s1, v0
-; GFX9-NEXT:    v_pk_min_i16 v0, s0, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_pk_min_i16 v1, s3, v0
+; GFX9-NEXT:    v_pk_min_i16 v0, s2, v3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_imin_sle_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_min_i16 v1, s1, s3
-; GFX10-NEXT:    v_pk_min_i16 v0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    v_pk_min_i16 v1, s3, s5
+; GFX10-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_sle_v4i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_i16 v1, s1, s3
-; GFX11-NEXT:    v_pk_min_i16 v0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_pk_min_i16 v1, s3, s5
+; GFX11-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v4i16:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_pk_min_i16 v1, s1, s3
-; GFX1250-NEXT:    v_pk_min_i16 v0, s0, s2
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-NEXT:    v_pk_min_i16 v1, s3, s5
+; GFX1250-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1611,91 +1611,91 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ;
 ; CI-LABEL: s_test_imin_slt_v2i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_min_i32 s1, s1, s3
-; CI-NEXT:    s_min_i32 s0, s0, s2
-; CI-NEXT:    v_mov_b32_e32 v2, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_min_i32 s0, s3, s5
+; CI-NEXT:    s_min_i32 s1, s2, s4
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_imin_slt_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_min_i32 s0, s3, s5
+; VI-NEXT:    s_min_i32 s1, s2, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_imin_slt_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_min_i32 s1, s1, s3
-; GFX9-NEXT:    s_min_i32 s0, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_min_i32 s3, s3, s5
+; GFX9-NEXT:    s_min_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_imin_slt_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_min_i32 s0, s0, s2
-; GFX10-NEXT:    s_min_i32 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_min_i32 s2, s2, s4
+; GFX10-NEXT:    s_min_i32 s3, s3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_slt_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s0, s0, s2
-; GFX11-NEXT:    s_min_i32 s1, s1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_min_i32 s2, s2, s4
+; GFX11-NEXT:    s_min_i32 s3, s3, s5
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: s_test_imin_slt_v2i32:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_min_i32 s0, s0, s2
-; GFX1250-NEXT:    s_min_i32 s1, s1, s3
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-NEXT:    s_min_i32 s2, s2, s4
+; GFX1250-NEXT:    s_min_i32 s3, s3, s5
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index f78168ba29ef1..bcfeae61af68c 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -75,18 +75,31 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
 ; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-NEXT:    s_endpgm
 ;
-; GFX1250-LABEL: fadd_v2_vs:
-; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT:    s_endpgm
+; GFX1250-SDAG-LABEL: fadd_v2_vs:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_vs:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1332,18 +1345,31 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
 ; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-NEXT:    s_endpgm
 ;
-; GFX1250-LABEL: fmul_v2_vs:
-; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT:    s_endpgm
+; GFX1250-SDAG-LABEL: fmul_v2_vs:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_vs:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -3466,8 +3492,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; GFX900-LABEL: fadd_fadd_fsub:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX900-NEXT:    v_add_f32_e32 v0, s1, v0
@@ -3475,14 +3501,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; GFX900-NEXT:    v_add_f32_e32 v3, s2, v0
 ; GFX900-NEXT:    v_sub_f32_e32 v0, s0, v1
 ; GFX900-NEXT:    v_subrev_f32_e32 v1, s3, v3
-; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; PACKED-SDAG-LABEL: fadd_fadd_fsub:
 ; PACKED-SDAG:       ; %bb.0: ; %bb
 ; PACKED-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; PACKED-SDAG-NEXT:    v_add_f32_e32 v0, s1, v0
@@ -3490,7 +3516,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v3, v0
 ; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
-; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
+; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
 ; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 25020673bce22..3f446af58ee70 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -94,62 +94,64 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; SI-LABEL: rotl_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_i32 s3, 32, s3
-; SI-NEXT:    s_sub_i32 s2, 32, s2
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_sub_i32 s0, 32, s9
+; SI-NEXT:    s_sub_i32 s1, 32, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: rotl_v2i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s2, 32, s2
-; GFX8-NEXT:    s_sub_i32 s3, 32, s3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_sub_i32 s1, 32, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_sub_i32 s0, 32, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v2
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotl_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s3, 32, s3
-; GFX10-NEXT:    s_sub_i32 s2, 32, s2
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    s_sub_i32 s4, 32, s7
+; GFX10-NEXT:    s_sub_i32 s5, 32, s6
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s3, s4
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s2, s5
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_i32 s3, 32, s3
-; GFX11-NEXT:    s_sub_i32 s2, 32, s2
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_sub_i32 s4, 32, s7
+; GFX11-NEXT:    s_sub_i32 s5, 32, s6
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s3, s4
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s2, s5
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <2 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 74ac181c120b5..0d14fcd45f45e 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -83,54 +83,56 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; SI-LABEL: rotr_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: rotr_v2i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotr_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s3, s7
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s2, s6
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s3, s5
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s2, s4
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index f14a5cc19774d..541b528256b5c 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 {
 }
 
 ; SI-LABEL: {{^}}s_addk_v2i32_k0:
-; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
-; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x42
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x41
 ; SI: s_endpgm
 ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b
 define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
index a0bac532454f5..2203e44053fe2 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -25,18 +25,19 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a
 define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: sext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s5, s9, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_bfe_i64 s[0:1], s[8:9], 0x100000
+; GCN-NEXT:    s_add_u32 s0, s2, s0
+; GCN-NEXT:    s_addc_u32 s1, s3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %sext = sext i16 %a to i64
   %res = add i64 %b, %sext
@@ -89,18 +90,19 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16
 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) {
 ; GCN-LABEL: sext_i32_to_i64_uniform:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s5, s8, 31
-; GCN-NEXT:    s_add_u32 s4, s6, s8
-; GCN-NEXT:    s_addc_u32 s5, s7, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_ashr_i32 s1, s8, 31
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_add_u32 s0, s2, s8
+; GCN-NEXT:    s_addc_u32 s1, s3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %sext = sext i32 %a to i64
   %res = add i64 %b, %sext
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 28330bfc9bb69..7170e06835e67 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1394,28 +1394,30 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_64_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 64, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 64, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_64_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 64, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 64, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_64_i64:
@@ -1444,28 +1446,30 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_1_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 1, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_1_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 1, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_1_i64:
@@ -1495,28 +1499,30 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
 define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_1_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 1.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_1_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 1.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_1_0_i64:
@@ -1542,28 +1548,30 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -1.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -1.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
@@ -1589,28 +1597,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_0_5_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 0.5, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_0_5_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 0.5, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_0_5_i64:
@@ -1636,28 +1646,30 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -0.5, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -0.5, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
@@ -1683,28 +1695,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_2_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 2.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_2_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 2.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_2_0_i64:
@@ -1730,28 +1744,30 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -2.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -2.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
@@ -1777,28 +1793,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 4.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 4.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_4_0_i64:
@@ -1824,28 +1842,30 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -4.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -4.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
@@ -1874,28 +1894,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 0x40800000, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 0x40800000, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
@@ -1926,32 +1948,34 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, -4.0
-; SI-NEXT:    s_mov_b32 s5, -1
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s0, -4.0
+; SI-NEXT:    s_mov_b32 s1, -1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, -4.0
-; VI-NEXT:    s_mov_b32 s5, -1
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, -4.0
+; VI-NEXT:    s_mov_b32 s1, -1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
@@ -1982,32 +2006,34 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s5, 4.0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s8, 0
+; SI-NEXT:    s_mov_b32 s9, 4.0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, 4.0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s8, 0
+; VI-NEXT:    s_mov_b32 s9, 4.0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
@@ -2033,32 +2059,34 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o
 define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s5, -4.0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s8, 0
+; SI-NEXT:    s_mov_b32 s9, -4.0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, -4.0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s8, 0
+; VI-NEXT:    s_mov_b32 s9, -4.0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index fb336f4558a63..601860b0be976 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -339,38 +339,38 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr a
 define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 {
 ; CI-LABEL: reorder_smrd_load_local_store_smrd_load:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dword s8, s[4:5], 0xb
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x63
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s4, s0
+; CI-NEXT:    s_mov_b32 s5, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x1
 ; CI-NEXT:    v_mov_b32_e32 v1, s8
-; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    ds_write_b32 v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s4, s4, s5
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_add_i32 s0, s0, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: reorder_smrd_load_local_store_smrd_load:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    ds_write_b32 v2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_add_i32 s2, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 47998767a948c..be10302c42854 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -331,39 +331,39 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s7, s2, 16
-; VI-NEXT:    s_sub_i32 s7, 0, s7
-; VI-NEXT:    s_sub_i32 s4, 0, s3
-; VI-NEXT:    s_lshr_b32 s6, s3, 16
-; VI-NEXT:    s_ashr_i32 s8, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s7, s7
-; VI-NEXT:    s_sub_i32 s5, 0, s2
-; VI-NEXT:    s_sub_i32 s6, 0, s6
-; VI-NEXT:    s_max_i32 s7, s8, s7
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_sub_i32 s1, 0, s2
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_sub_i32 s0, 0, s3
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_ashr_i32 s6, s2, 16
+; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s3, s4
-; VI-NEXT:    s_max_i32 s6, s8, s6
-; VI-NEXT:    s_max_i32 s2, s2, s5
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s4, s6, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_add_i32 s2, s2, 2
-; VI-NEXT:    s_or_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s4, s7, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s2, s4, s2
-; VI-NEXT:    s_add_i32 s3, s3, 0x20000
-; VI-NEXT:    s_add_i32 s2, s2, 0x20000
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_sub_i32 s4, 0, s4
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_max_i32 s1, s2, s1
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_sext_i32_i16 s2, s3
+; VI-NEXT:    s_max_i32 s5, s6, s5
+; VI-NEXT:    s_ashr_i32 s6, s3, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_max_i32 s0, s2, s0
+; VI-NEXT:    s_max_i32 s4, s6, s4
+; VI-NEXT:    s_add_i32 s0, s0, 2
+; VI-NEXT:    s_lshl_b32 s2, s4, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_add_i32 s1, s1, 2
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_lshl_b32 s2, s5, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s1, s2, s1
+; VI-NEXT:    s_add_i32 s0, s0, 0x20000
+; VI-NEXT:    s_add_i32 s1, s1, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_abs_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
index 64d5d01454a37..4b4e03df72903 100644
--- a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
@@ -134,10 +134,11 @@ define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) {
 ; CHECK-LABEL: store_as4_2xi32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; CHECK-NEXT:    s_endpgm
   store <2 x i32> %v, ptr addrspace(4) %p
   ret void
@@ -161,10 +162,11 @@ define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v
 ; CHECK-LABEL: store_as4_2xfloat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
 ; CHECK-NEXT:    s_endpgm
   store <2 x float> %v, ptr addrspace(4) %p
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index f6c357dc38b48..023a50e83cd94 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,92 +32,93 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[54:55], s[8:9], 0x8
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
+; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dword s9, s[38:39], 0x20
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[0:1]
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s8, 0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s9, 0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, v1
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s8, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s9, 1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s6, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s7, 1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr58_vgpr59
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 2
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 4
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 5
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v1
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s4, 6
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s5, 7
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v3
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s70, 8
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s71, 9
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 5
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow17
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow30
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
@@ -137,9 +138,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -154,12 +155,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock14
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow28
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -168,7 +169,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow27
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -188,8 +189,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s8, 10
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s9, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v57, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v57, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v57, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v57, 7
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s55
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
@@ -206,12 +208,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_16
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow18
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
@@ -220,7 +222,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -237,7 +239,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -251,9 +253,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -261,9 +263,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[58:59], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -274,25 +276,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v57, 8
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s8, v57, 10
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    s_mov_b32 s55, s83
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v57, 9
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow26
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v57, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v57, 3
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
@@ -318,9 +318,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
@@ -336,9 +336,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
@@ -346,92 +346,93 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[8:9], 0x8
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
+; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dword s9, s[38:39], 0x20
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[0:1]
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s8, 0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s9, 0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, v1
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s8, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s9, 1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[82:83], 1, v3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s6, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s7, 1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
-; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr58_vgpr59
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 2
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 4
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 5
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v1
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s4, 6
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s5, 7
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s84, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s85, 9
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s82, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s83, 9
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 5
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow17
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow30
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
@@ -451,13 +452,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -468,12 +469,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock14
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow28
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -482,7 +483,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow27
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -502,9 +503,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s8, 10
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s9, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v57, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v57, 3
-; GLOBALNESS0-NEXT:    s_mov_b32 s83, s55
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v57, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v57, 7
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s55
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
@@ -521,12 +522,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_16
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow18
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
@@ -535,7 +536,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -552,21 +553,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s84, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s85, s39, 0
+; GLOBALNESS0-NEXT:    s_add_u32 s82, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s83, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -574,9 +575,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -589,23 +590,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v57, 8
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v57, 10
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT:    s_mov_b32 s55, s81
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v57, 9
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v57, 11
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow26
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v57, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v57, 3
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
@@ -631,7 +632,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -649,7 +650,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index eaab3531824c4..fc42f476fe7d0 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -164,98 +164,102 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
 ;
 ; GFX6-LABEL: test_udivrem_v2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s6, s6, s2
-; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_mul_i32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s4, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v0, s4, v1
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s9
+; GFX6-NEXT:    s_sub_i32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_udivrem_v2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT:    s_sub_i32 s6, 0, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT:    s_sub_i32 s0, 0, s6
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX8-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8-NEXT:    s_sub_i32 s0, s0, s6
-; GFX8-NEXT:    s_sub_i32 s6, s0, s2
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX8-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX8-NEXT:    s_sub_i32 s6, s0, s2
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX8-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX8-NEXT:    s_sub_i32 s2, 0, s3
-; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_mul_i32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s2, s2, s4
+; GFX8-NEXT:    s_sub_i32 s4, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s4, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s4, 0, s7
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v2, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    s_mul_i32 s0, s0, s3
-; GFX8-NEXT:    s_sub_i32 s0, s1, s0
-; GFX8-NEXT:    s_sub_i32 s1, s0, s3
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s7
+; GFX8-NEXT:    s_sub_i32 s0, s3, s0
+; GFX8-NEXT:    s_sub_i32 s1, s0, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s7
 ; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX8-NEXT:    s_sub_i32 s1, s0, s3
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX8-NEXT:    s_sub_i32 s1, s0, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s7
 ; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 983acfc2c0699..bd311a1054a41 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -340,8 +340,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
-; GCN-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
index c3935821c31dd..e1ddd155b5437 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -25,18 +25,19 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a
 define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: zext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s4, s8, 0xffff
-; GCN-NEXT:    s_add_u32 s4, s6, s4
-; GCN-NEXT:    s_addc_u32 s5, s7, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_and_b32 s0, s8, 0xffff
+; GCN-NEXT:    s_add_u32 s0, s2, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_addc_u32 s1, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %zext = zext i16 %a to i64
   %res = add i64 %b, %zext
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
index c53f4b6d7ff2b..f8659812ad2ad 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -7,19 +7,24 @@ define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
-; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
-; CHECK-NEXT:    [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[LOAD2_MUT1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[LOAD4_MUT2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[LOAD3_MUT3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT1]] to <2 x i16>
+; CHECK-NEXT:    [[LOAD3_TOORIG:%.*]] = bitcast i32 [[LOAD3_MUT3]] to float
+; CHECK-NEXT:    [[LOAD4_TOORIG:%.*]] = bitcast i32 [[LOAD4_MUT2]] to <4 x i8>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
 ; CHECK-NEXT:    store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
-; CHECK-NEXT:    [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
-; CHECK-NEXT:    store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
+; CHECK-NEXT:    [[LOAD2_BC:%.*]] = bitcast <2 x i16> [[LOAD2_TOORIG]] to i32
+; CHECK-NEXT:    store i32 [[LOAD2_BC]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[LOAD3_BC:%.*]] = bitcast float [[LOAD3_TOORIG]] to i32
 ; CHECK-NEXT:    [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
-; CHECK-NEXT:    store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
+; CHECK-NEXT:    [[LOAD4_BC:%.*]] = bitcast <4 x i8> [[LOAD4_TOORIG]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD4_BC]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD3_BC]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
@@ -45,13 +50,17 @@ define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2
 ; CHECK-LABEL: define void @merge_f32_v2f16_type(
 ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
-; CHECK-NEXT:    [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float
+; CHECK-NEXT:    [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
-; CHECK-NEXT:    store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
-; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32
+; CHECK-NEXT:    [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index d6b51039d5b44..ac22859ed4e34 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -130,21 +130,39 @@ entry:
   ret void
 }
 
-; Ideally this would be merged
 define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
 ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16>
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
+
+  %ld.0 = load i32, ptr addrspace(1) %a
+  %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
+
+  ret void
+}
+
+define amdgpu_kernel void @merge_load_i32_v2i64(ptr addrspace(1) nocapture %a) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i64(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
 ; CHECK-NEXT:    [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
+; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
 
   %ld.0 = load i32, ptr addrspace(1) %a
-  %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
+  %ld.1 = load <2 x i8>, ptr addrspace(1) %a.1
 
   ret void
 }
