[llvm] [LSV] Merge contiguous chains across scalar types (PR #154069)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 18 00:38:14 PDT 2025


https://github.com/gandhi56 created https://github.com/llvm/llvm-project/pull/154069

This change enables the Load/Store Vectorizer to merge and vectorize contiguous chains even when their scalar element types differ, as long as the total bitwidth matches. To do so, we rebase offsets between chains, normalize value types to a common integer type, and insert the necessary casts around loads and stores. This uncovers more vectorization opportunities and explains the expected codegen updates across AMDGPU tests.

Key changes:
- Chain merging
  - Build contiguous subchains and then merge adjacent ones when:
    - They refer to the same underlying pointer object and address space.
    - They are either all loads or all stores.
    - A constant leader-to-leader delta exists.
    - Rebasing one chain into the other's coordinate space does not overlap.
    - All elements have equal total bit width.
  - Rebase the second chain by the computed delta and append it to the first.

- Type normalization and casting
  - Normalize merged chains to a common integer type sized to the total bits.
  - For loads: create a new load of the normalized type, copy metadata, and cast back to the original type for uses if needed.
  - For stores: bitcast the value to the normalized type and store that.
  - Insert zext/trunc for integer size changes; use bit-or-pointer casts when sizes match.

- Cleanups
  - Erase replaced instructions and DCE pointer operands when safe.
  - New helpers: computeLeaderDelta, chainsOverlapAfterRebase, rebaseChain, normalizeChainToType, and allElemsMatchTotalBits.

Impact:
- Increases vectorization opportunities across mixed-typed but size-compatible access chains.
- Large set of expected AMDGPU codegen diffs due to more/changed vectorization.

>From 9dd14e7711066f6694a6da0a731f9babfd7f9d97 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Thu, 14 Aug 2025 16:23:40 -0600
Subject: [PATCH] [LSV] Merge contiguous chains across scalar types

This change enables the Load/Store Vectorizer to
merge and vectorize contiguous chains even when
their scalar element types differ, as long as the
total bitwidth matches. To do so, we rebase
offsets between chains, normalize value types to
a common integer type, and insert the necessary
casts around loads and stores. This uncovers more
vectorization opportunities and explains the
expected codegen updates across AMDGPU tests.

Key changes:
- Chain merging
  - Build contiguous subchains and then merge
  adjacent ones when:
    - They refer to the same underlying pointer
    object and address space.
    - They are either all loads or all stores.
    - A constant leader-to-leader delta exists.
    - Rebasing one chain into the other's
    coordinate space does not overlap.
    - All elements have equal total bit width.
  - Rebase the second chain by the computed delta
  and append it to the first.

- Type normalization and casting
  - Normalize merged chains to a common integer
  type sized to the total bits.
  - For loads: create a new load of the normalized
  type, copy metadata, and cast back to the
  original type for uses if needed.
  - For stores: bitcast the value to the
  normalized type and store that.
  - Insert zext/trunc for integer size changes;
  use bit-or-pointer casts when sizes match.

- Cleanups
  - Erase replaced instructions and DCE pointer
  operands when safe.
  - New helpers: computeLeaderDelta,
  chainsOverlapAfterRebase, rebaseChain,
    normalizeChainToType, and
    allElemsMatchTotalBits.

Impact:
- Increases vectorization opportunities across
mixed-typed but size-compatible access chains.
- Large set of expected AMDGPU codegen diffs due
to more/changed vectorization.
---
 llvm/include/llvm/Transforms/Utils/Local.h    |    4 +
 llvm/lib/Transforms/Utils/Local.cpp           |   45 +
 .../Vectorize/LoadStoreVectorizer.cpp         |  283 +++-
 .../AMDGPU/32-bit-local-address-space.ll      |   18 +-
 .../AMDGPU/GlobalISel/amdgpu-irtranslator.ll  |    3 +
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  |  336 ++--
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   85 +-
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |   20 +-
 llvm/test/CodeGen/AMDGPU/add_i1.ll            |   78 +-
 llvm/test/CodeGen/AMDGPU/add_i64.ll           |  133 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  242 +--
 llvm/test/CodeGen/AMDGPU/alloca.ll            |    1 +
 llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll       |  448 ++++-
 .../AMDGPU/amdgcn-load-offset-from-reg.ll     |    5 +
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 1487 +++++++++--------
 llvm/test/CodeGen/AMDGPU/and.ll               |   57 +-
 .../CodeGen/AMDGPU/atomic_cmp_swap_local.ll   |  243 ++-
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll |   47 +-
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |   14 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i16.ll  |   36 +-
 llvm/test/CodeGen/AMDGPU/fabs.bf16.ll         |  103 +-
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |   28 +-
 llvm/test/CodeGen/AMDGPU/fabs.ll              |   14 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |  104 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  333 ++--
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      | 1308 +++++++--------
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |   10 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |   54 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll    |  108 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |   14 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll         |   14 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |   14 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |  360 ++--
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  192 +--
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  195 +--
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    | 1351 +++++++--------
 llvm/test/CodeGen/AMDGPU/half.ll              |  174 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |   54 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |   22 +-
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       |   24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll    |  275 ++-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |   56 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |   86 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |   86 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |   36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |   76 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |   76 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   18 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |   56 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |  192 +--
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |    8 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   68 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   52 +-
 llvm/test/CodeGen/AMDGPU/s_addk_i32.ll        |    4 +-
 .../AMDGPU/sext-divergence-driven-isel.ll     |   44 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |  500 +++---
 .../AMDGPU/si-triv-disjoint-mem-access.ll     |   28 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     |   64 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  259 +--
 llvm/test/CodeGen/AMDGPU/udivrem.ll           |  126 +-
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |    2 +-
 .../AMDGPU/zext-divergence-driven-isel.ll     |   21 +-
 .../AMDGPU/merge-vectors-complex.ll           |   39 +-
 .../AMDGPU/merge-vectors.ll                   |   24 +-
 65 files changed, 5418 insertions(+), 4865 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index df146458b4e6f..ad29d69900235 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -448,6 +448,10 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
 /// replacement for the source instruction).
 LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
 
+/// Copy the metadata from the source instruction to the destination (the
+/// replacement for the source instruction).
+LLVM_ABI void copyMetadataForStore(StoreInst &Dest, const StoreInst &Source);
+
 /// Patch the replacement so that it is not more restrictive than the value
 /// being replaced. It assumes that the replacement does not get moved from
 /// its original position.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index f5208d50c6aae..ecdc920d442dc 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3496,6 +3496,51 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
   }
 }
 
+void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  Source.getAllMetadata(MD);
+  MDBuilder MDB(Dest.getContext());
+  Type *NewType = Dest.getType();
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_access_group:
+    case LLVMContext::MD_noundef:
+    case LLVMContext::MD_noalias_addrspace:
+    case LLVMContext::MD_mem_parallel_loop_access:
+      Dest.setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_tbaa: {
+      MDNode *NewTyNode =
+          MDB.createTBAAScalarTypeNode(NewType->getStructName(), N);
+      Dest.setMetadata(LLVMContext::MD_tbaa, NewTyNode);
+      break;
+    }
+    case LLVMContext::MD_nonnull:
+      break;
+
+    case LLVMContext::MD_align:
+    case LLVMContext::MD_dereferenceable:
+    case LLVMContext::MD_dereferenceable_or_null:
+      // These only directly apply if the new type is also a pointer.
+      if (NewType->isPointerTy())
+        Dest.setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_range:
+      break;
+    }
+  }
+}
+
 void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
   auto *ReplInst = dyn_cast<Instruction>(Repl);
   if (!ReplInst)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..12ac5c891aefa 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -112,6 +112,7 @@
 #include <optional>
 #include <tuple>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -268,11 +269,6 @@ class Vectorizer {
   /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
   bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
 
-  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
-  /// in the same BB with the same value for getUnderlyingObject() etc.
-  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                             ArrayRef<Instruction *> EqClass);
-
   /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
   /// where all instructions access a known, constant offset from the first
   /// instruction.
@@ -337,12 +333,24 @@ class Vectorizer {
   EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                 BasicBlock::iterator End);
 
+  /// Inserts a cast instruction to convert Inst to DstTy.
+  Value *insertCast(Value *Val, Type *DstTy);
+
   /// Partitions Instrs into "chains" where every instruction has a known
   /// constant offset from the first instr in the chain.
   ///
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  // Helpers for chain merging.
+  std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2);
+  bool chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+                                const APInt &Delta) const;
+  static bool allElemsMatchTotalBits(const Chain &C, unsigned TotalBits,
+                                     const DataLayout &DL);
+  static void rebaseChain(Chain &C, const APInt &Delta);
+  void normalizeChainToType(Chain &C, Type *CastTy);
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -424,6 +432,20 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
   return Changed ? PA : PreservedAnalyses::all();
 }
 
+static const Value *getUnderlyingPtrObject(const Value *Ptr) {
+  const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The select's themselves are distinct instructions even if they share
+    // the same condition and evaluate to consecutive pointers for true and
+    // false values of the condition. Therefore using the select's themselves
+    // for grouping instructions would put consecutive accesses into different
+    // lists and they won't be even checked for being consecutive, and won't
+    // be vectorized.
+    return Sel->getCondition();
+  }
+  return ObjPtr;
+}
+
 bool Vectorizer::run() {
   bool Changed = false;
   // Break up the BB if there are any instrs which aren't guaranteed to transfer
@@ -467,6 +489,98 @@ bool Vectorizer::run() {
   return Changed;
 }
 
+Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
+  if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) {
+    return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc");
+  }
+
+  // If the types are of different sizes and both are integers, we can use
+  // zext or sext to cast.
+  if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) {
+    if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) {
+      return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc");
+    }
+    return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc");
+  }
+
+  return nullptr;
+}
+
+std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
+                                                    Instruction *I2) {
+  assert((isa<LoadInst>(I1) || isa<StoreInst>(I1)) &&
+         (isa<LoadInst>(I2) || isa<StoreInst>(I2)) &&
+         "computeLeaderDelta must be called with load or store instructions");
+  Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
+  const Value *Ptr1 = getLoadStorePointerOperand(I1);
+  const Value *Ptr2 = getLoadStorePointerOperand(I2);
+  return getConstantOffset(const_cast<Value *>(Ptr1), const_cast<Value *>(Ptr2),
+                           CtxInst);
+}
+
+bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+                                          const APInt &Delta) const {
+  for (const ChainElem &EB : B) {
+    APInt OffB = EB.OffsetFromLeader + Delta;
+    unsigned SizeB = DL.getTypeStoreSize(getLoadStoreType(EB.Inst));
+    APInt EndB = OffB + SizeB;
+    for (const ChainElem &EA : A) {
+      APInt OffA = EA.OffsetFromLeader;
+      unsigned SizeA = DL.getTypeStoreSize(getLoadStoreType(EA.Inst));
+      APInt EndA = OffA + SizeA;
+      if (OffB.slt(EndA) && OffA.slt(EndB))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool Vectorizer::allElemsMatchTotalBits(const Chain &C, unsigned TotalBits,
+                                        const DataLayout &DL) {
+  return llvm::all_of(C, [&](const ChainElem &E) {
+    return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+  });
+}
+
+void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
+  for (ChainElem &E : C)
+    E.OffsetFromLeader += Delta;
+}
+
+void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) {
+  for (ChainElem &Elem : C) {
+    Instruction *Inst = Elem.Inst;
+    Type *OrigValTy = getLoadStoreType(Inst);
+    if (OrigValTy == CastTy)
+      continue;
+
+    if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+      Builder.SetInsertPoint(LI);
+      LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(),
+                                             LI->getName() + ".mut");
+      copyMetadataForLoad(*NewLoad, *LI);
+      Value *CastBack = insertCast(NewLoad, OrigValTy);
+      if (!CastBack)
+        llvm_unreachable("Failed to insert cast");
+      LI->replaceAllUsesWith(CastBack);
+      ToErase.emplace_back(LI);
+      Elem.Inst = NewLoad;
+    } else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+      Builder.SetInsertPoint(SI);
+      Value *CastVal = insertCast(SI->getValueOperand(), CastTy);
+      if (!CastVal)
+        llvm_unreachable("Failed to insert cast");
+      StoreInst *NewStore =
+          Builder.CreateStore(CastVal, SI->getPointerOperand());
+      NewStore->setAlignment(SI->getAlign());
+      NewStore->setVolatile(SI->isVolatile());
+      copyMetadataForStore(*NewStore, *SI);
+      ToErase.emplace_back(SI);
+      Elem.Inst = NewStore;
+    }
+  }
+}
+
 bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                                BasicBlock::iterator End) {
   LLVM_DEBUG({
@@ -479,49 +593,111 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
   });
 
   bool Changed = false;
+  SmallVector<Chain> ContiguousSubChains;
+
   for (const auto &[EqClassKey, EqClass] :
-       collectEquivalenceClasses(Begin, End))
-    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+       collectEquivalenceClasses(Begin, End)) {
 
-  return Changed;
-}
+    LLVM_DEBUG({
+      dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+             << " keyed on " << EqClassKey << ":\n";
+      for (Instruction *I : EqClass)
+        dbgs() << "  " << *I << "\n";
+    });
 
-bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                                       ArrayRef<Instruction *> EqClass) {
-  bool Changed = false;
+    for (Chain &C : gatherChains(EqClass)) {
 
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
-           << " keyed on " << EqClassKey << ":\n";
-    for (Instruction *I : EqClass)
-      dbgs() << "  " << *I << "\n";
-  });
+      // Split up the chain into increasingly smaller chains, until we can
+      // finally vectorize the chains.
+      //
+      // (Don't be scared by the depth of the loop nest here.  These operations
+      // are all at worst O(n lg n) in the number of instructions, and splitting
+      // chains doesn't change the number of instrs.  So the whole loop nest is
+      // O(n lg n).)
+      for (auto &C : splitChainByMayAliasInstrs(C)) {
+        for (auto &C : splitChainByContiguity(C)) {
+          ContiguousSubChains.emplace_back(C);
+        }
+      }
+    }
+  }
 
-  std::vector<Chain> Chains = gatherChains(EqClass);
-  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
-                    << " nontrivial chains.\n";);
-  for (Chain &C : Chains)
-    Changed |= runOnChain(C);
-  return Changed;
-}
+  // Merge chains in reverse order, so that the first chain is the largest.
+  for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
+    Chain &C1 = ContiguousSubChains[I - 1];
+    Chain &C2 = ContiguousSubChains[I];
 
-bool Vectorizer::runOnChain(Chain &C) {
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
-    dumpChain(C);
-  });
+    // If the scalar types of the chains are the same, we can merge them
+    // without inserting any casts.
+    if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
+        getLoadStoreType(C2[0].Inst)->getScalarType())
+      continue;
+
+    const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
+    const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
+    unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
+    unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
+    bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
+    bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
+
+    // If the chains are mapped to different types, have distinct underlying
+    // pointer objects, or include both loads and stores, skip.
+    if (getUnderlyingPtrObject(C1Ptr) != getUnderlyingPtrObject(C2Ptr) ||
+        C1IsLoad != C2IsLoad || AS1 != AS2)
+      continue;
+
+    // Compute constant offset between chain leaders; if unknown, skip.
+    std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
+    if (!DeltaOpt)
+      continue;
+
+    // Check that rebasing C2 into C1's coordinate space will not overlap C1.
+    if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
+      continue;
+
+    // Determine the common integer cast type for normalization and ensure total
+    // bitwidth matches across all elements of both chains.
+    Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
+    unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
+    auto AllElemsMatchTotalBits = [&](const Chain &C) {
+      return llvm::all_of(C, [&](const ChainElem &E) {
+        return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+      });
+    };
+    if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
+      continue;
+
+    // Rebase C2's offsets into C1's coordinate space prior to merging.
+    rebaseChain(C2, *DeltaOpt);
+
+    // Merge C2 into C1 by appending all elements of C2 to C1, then erase C2
+    // from ContiguousSubChains.
+    C1.insert(C1.end(), C2.begin(), C2.end());
+    ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
+
+    // Normalize the value operand/result type of each instruction in C1 to
+    // C1CastTy.
+    Type *C1CastTy =
+        Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy));
+    normalizeChainToType(C1, C1CastTy);
+  }
+
+  for (auto &C : ContiguousSubChains) {
+    if (C.size() <= 1)
+      continue;
+    for (auto &AlignedSubChain : splitChainByAlignment(C))
+      Changed |= vectorizeChain(AlignedSubChain);
+  }
+
+  // Erase all instructions scheduled for deletion in this pseudo-BB.
+  for (Instruction *I : ToErase) {
+    auto *PtrOperand = getLoadStorePointerOperand(I);
+    if (I->use_empty())
+      I->eraseFromParent();
+    RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+  }
+  ToErase.clear();
 
-  // Split up the chain into increasingly smaller chains, until we can finally
-  // vectorize the chains.
-  //
-  // (Don't be scared by the depth of the loop nest here.  These operations are
-  // all at worst O(n lg n) in the number of instructions, and splitting chains
-  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
-  bool Changed = false;
-  for (auto &C : splitChainByMayAliasInstrs(C))
-    for (auto &C : splitChainByContiguity(C))
-      for (auto &C : splitChainByAlignment(C))
-        Changed |= vectorizeChain(C);
   return Changed;
 }
 
@@ -578,7 +754,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
         LLVM_DEBUG(
             dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                    << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
-        if (NewChain.size() > 1) {
+        if (!NewChain.empty()) {
           LLVM_DEBUG({
             dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
             dumpChain(NewChain);
@@ -590,7 +766,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
         NewChain = SmallVector<ChainElem, 1>({*ChainIt});
       }
     }
-    if (NewChain.size() > 1) {
+    if (!NewChain.empty()) {
       LLVM_DEBUG({
         dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
         dumpChain(NewChain);
@@ -643,8 +819,6 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
       Ret.push_back({*It});
   }
 
-  // Filter out length-1 chains, these are uninteresting.
-  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
   return Ret;
 }
 
@@ -1427,20 +1601,6 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                       BasicBlock::iterator End) {
   EquivalenceClassMap Ret;
 
-  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
-    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
-    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
-      // The select's themselves are distinct instructions even if they share
-      // the same condition and evaluate to consecutive pointers for true and
-      // false values of the condition. Therefore using the select's themselves
-      // for grouping instructions would put consecutive accesses into different
-      // lists and they won't be even checked for being consecutive, and won't
-      // be vectorized.
-      return Sel->getCondition();
-    }
-    return ObjPtr;
-  };
-
   for (Instruction &I : make_range(Begin, End)) {
     auto *LI = dyn_cast<LoadInst>(&I);
     auto *SI = dyn_cast<StoreInst>(&I);
@@ -1488,7 +1648,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
         (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
       continue;
 
-    Ret[{GetUnderlyingObject(Ptr), AS,
+    Ret[{getUnderlyingPtrObject(Ptr), AS,
          DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
          /*IsLoad=*/LI != nullptr}]
         .emplace_back(&I);
@@ -1583,8 +1743,7 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
   Ret.reserve(Chains.size());
   // Iterate over MRU rather than Chains so the order is deterministic.
   for (auto &E : MRU)
-    if (E.second.size() > 1)
-      Ret.emplace_back(std::move(E.second));
+    Ret.emplace_back(std::move(E.second));
   return Ret;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 840165d5a7e7a..fc1b636f525fd 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -298,26 +298,24 @@ define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val)
 define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
 ; GFX7-LABEL: local_address_gep_store:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_add_i32 s0, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX7-NEXT:    s_add_i32 s0, s0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    ds_write_b32 v1, v0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: local_address_gep_store:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    s_add_i32 s0, s2, s1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX8-NEXT:    s_add_i32 s0, s0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    ds_write_b32 v1, v0
 ; GFX8-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 523d51ddcd2bc..a9271794bc03b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
 
 ; This file checks that the translation from llvm IR to generic
@@ -11,3 +12,5 @@ define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
   store i32 %res, ptr addrspace(1) poison
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 63009bdc2643f..4abdbe20033a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -65,32 +65,28 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -148,32 +144,28 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -231,32 +223,28 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -314,32 +302,28 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -397,32 +381,28 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -480,32 +460,28 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -563,32 +539,28 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -646,32 +618,28 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -729,32 +697,28 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -812,32 +776,28 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -895,32 +855,28 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -978,32 +934,28 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 6846137272ec6..7c82c32488beb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -513,15 +513,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x4
-; GFX7-NEXT:    s_load_dword s6, s[4:5], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0xd
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s2, 1, s6
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX7-NEXT:    s_and_b32 s3, 1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_nop 1
@@ -531,18 +529,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x10
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8-NEXT:    s_nop 2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    s_and_b32 s2, 1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -550,65 +545,57 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x34
-; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x10
-; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10_W32-NEXT:    s_clause 0x1
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX10_W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    s_and_b32 s4, 1, s6
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x34
-; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x10
-; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10_W64-NEXT:    s_clause 0x1
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX10_W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    s_and_b32 s4, 1, s6
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x34
-; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x10
-; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11_W32-NEXT:    s_clause 0x1
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11_W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    s_and_b32 s4, 1, s6
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x34
-; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x10
-; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11_W64-NEXT:    s_clause 0x1
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11_W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    s_and_b32 s4, 1, s6
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s2, 1.0, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index ce195593627db..1e6a8a562fd7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -684,15 +684,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -760,15 +759,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[6:7], v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -836,15 +834,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[6:7], s[6:7], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -912,15 +909,14 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
 ;
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x54
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x54
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], s[6:7], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], s[4:5], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index ff1a3ee38be1d..7c13a0bd97850 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -1,11 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
-; GCN-LABEL: {{^}}add_var_var_i1:
-; GFX9:  v_xor_b32_e32
-; GFX10: v_xor_b32_e32
 define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; GFX9-LABEL: add_var_var_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in0
   %b = load volatile i1, ptr addrspace(1) %in1
   %add = add i1 %a, %b
@@ -13,21 +25,64 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1
   ret void
 }
 
-; GCN-LABEL: {{^}}add_var_imm_i1:
-; GFX9:  s_xor_b64
-; GFX10: s_xor_b32
 define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX9-LABEL: add_var_imm_i1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in
   %add = add i1 %a, 1
   store i1 %add, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}add_i1_cf:
-; GCN: ; %endif
-; GFX9: s_xor_b64
-; GFX10: s_xor_b32
 define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; GFX9-LABEL: add_i1_cf:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9-NEXT:  ; %bb.1: ; %else
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[8:9] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:  .LBB2_2: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3: ; %if
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[4:5], exec
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_or_b64 s[4:5], s[2:3], s[4:5]
+; GFX9-NEXT:  .LBB2_4: ; %endif
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[4:5], -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
@@ -49,3 +104,6 @@ endif:
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX10: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index 9400bf6098552..e6394ee4ad014 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,12 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 
 declare i32 @llvm.amdgcn.workitem.id.x() readnone
 
-; SI-LABEL: {{^}}test_i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
 define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_i64_vreg:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s14, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
   %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
@@ -18,10 +38,22 @@ define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; Check that the SGPR add operand is correctly moved to a VGPR.
-; SI-LABEL: {{^}}sgpr_operand:
-; SI: s_add_u32
-; SI: s_addc_u32
 define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) {
+; SI-LABEL: sgpr_operand:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_u32 s4, s6, s4
+; SI-NEXT:    s_addc_u32 s5, s7, s5
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %foo = load i64, ptr addrspace(1) %in, align 8
   %result = add i64 %foo, %a
   store i64 %result, ptr addrspace(1) %out
@@ -30,35 +62,76 @@ define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrs
 
 ; Swap the arguments. Check that the SGPR -> VGPR copy works with the
 ; SGPR as other operand.
-;
-; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: s_add_u32
-; SI: s_addc_u32
 define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) {
+; SI-LABEL: sgpr_operand_reversed:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_u32 s4, s4, s6
+; SI-NEXT:    s_addc_u32 s5, s5, s7
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %foo = load i64, ptr addrspace(1) %in, align 8
   %result = add i64 %a, %foo
   store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-
-; SI-LABEL: {{^}}test_v2i64_sreg:
-; SI: s_add_u32
-; SI: s_addc_u32
-; SI: s_add_u32
-; SI: s_addc_u32
 define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) {
+; SI-LABEL: test_v2i64_sreg:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_u32 s4, s10, s14
+; SI-NEXT:    s_addc_u32 s5, s11, s15
+; SI-NEXT:    s_add_u32 s6, s8, s12
+; SI-NEXT:    s_addc_u32 s7, s9, s13
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v3, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}test_v2i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
-; SI: v_add_i32
-; SI: v_addc_u32
 define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_v2i64_vreg:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s14, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, 0
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
@@ -69,14 +142,18 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_i64_add_to_i32:
-; SI: s_load_dword s[[SREG0:[0-9]+]]
-; SI: s_load_dword s[[SREG1:[0-9]+]]
-; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: addc
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
 define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
+; SI-LABEL: trunc_i64_add_to_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_add_i32 s4, s4, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3160e38df5e3f..4c5a1d71e41b2 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -513,53 +513,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
 ; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX908-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX908-NEXT:    s_mov_b32 s12, 0
-; GFX908-NEXT:    s_mov_b32 s9, s12
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX908-NEXT:    s_load_dwordx2 s[16:17], s[8:9], 0x0
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
+; GFX908-NEXT:    s_mov_b32 s4, 0
+; GFX908-NEXT:    s_mov_b32 s9, s4
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX908-NEXT:    s_sub_i32 s1, 0, s7
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s0
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX908-NEXT:    s_sub_i32 s3, 0, s1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s2
 ; GFX908-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX908-NEXT:    s_mul_i32 s1, s1, s2
-; GFX908-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX908-NEXT:    s_add_i32 s2, s2, s1
-; GFX908-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX908-NEXT:    s_mul_i32 s2, s1, s7
-; GFX908-NEXT:    s_sub_i32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_sub_i32 s6, s2, s7
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX908-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX908-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX908-NEXT:    s_add_i32 s3, s1, 1
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX908-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    s_mul_i32 s3, s3, s5
+; GFX908-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX908-NEXT:    s_add_i32 s5, s5, s3
+; GFX908-NEXT:    s_mul_hi_u32 s3, s0, s5
+; GFX908-NEXT:    s_mul_i32 s5, s3, s1
+; GFX908-NEXT:    s_sub_i32 s0, s0, s5
+; GFX908-NEXT:    s_add_i32 s8, s3, 1
+; GFX908-NEXT:    s_sub_i32 s5, s0, s1
+; GFX908-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX908-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX908-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX908-NEXT:    s_add_i32 s5, s3, 1
+; GFX908-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX908-NEXT:    s_cselect_b32 s8, s5, s3
+; GFX908-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s2
-; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX908-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX908-NEXT:    s_or_b32 s14, s14, 28
-; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX908-NEXT:    s_or_b32 s12, s12, 28
+; GFX908-NEXT:    s_lshl_b64 s[14:15], s[8:9], 5
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
 ; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX908-NEXT:    s_mul_i32 s3, s5, s2
-; GFX908-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX908-NEXT:    s_mul_i32 s2, s4, s2
+; GFX908-NEXT:    s_mul_i32 s3, s17, s2
+; GFX908-NEXT:    s_mul_hi_u32 s5, s16, s2
+; GFX908-NEXT:    s_mul_i32 s2, s16, s2
 ; GFX908-NEXT:    s_add_i32 s3, s5, s3
-; GFX908-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX908-NEXT:    s_lshl_b64 s[16:17], s[2:3], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
-; GFX908-NEXT:  .LBB3_1: ; %Flow20
+; GFX908-NEXT:  .LBB3_1: ; %Flow21
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
@@ -572,47 +572,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX908-NEXT:    s_mov_b32 s13, s12
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
+; GFX908-NEXT:    s_mov_b32 s5, s4
 ; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v4, s12
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v6, s12
-; GFX908-NEXT:    v_mov_b32_e32 v8, s12
-; GFX908-NEXT:    v_mov_b32_e32 v5, s13
-; GFX908-NEXT:    v_mov_b32_e32 v7, s13
-; GFX908-NEXT:    v_mov_b32_e32 v9, s13
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX908-NEXT:    v_mov_b32_e32 v7, s5
+; GFX908-NEXT:    v_mov_b32_e32 v9, s5
+; GFX908-NEXT:    v_mov_b32_e32 v5, s5
+; GFX908-NEXT:    v_mov_b32_e32 v6, s4
+; GFX908-NEXT:    v_mov_b32_e32 v8, s4
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT:    s_mov_b64 s[20:21], s[12:13]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s13, v3
-; GFX908-NEXT:    s_add_u32 s9, s9, 1
-; GFX908-NEXT:    s_addc_u32 s13, s13, 0
-; GFX908-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX908-NEXT:    s_mul_i32 s13, s6, s13
-; GFX908-NEXT:    s_mul_i32 s23, s7, s9
-; GFX908-NEXT:    s_add_i32 s13, s22, s13
-; GFX908-NEXT:    s_mul_i32 s9, s6, s9
-; GFX908-NEXT:    s_add_i32 s13, s13, s23
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX908-NEXT:    s_add_u32 s5, s5, 1
+; GFX908-NEXT:    s_addc_u32 s9, s9, 0
+; GFX908-NEXT:    s_mul_hi_u32 s22, s10, s5
+; GFX908-NEXT:    s_mul_i32 s9, s10, s9
+; GFX908-NEXT:    s_mul_i32 s23, s11, s5
+; GFX908-NEXT:    s_add_i32 s9, s22, s9
+; GFX908-NEXT:    s_mul_i32 s5, s10, s5
+; GFX908-NEXT:    s_add_i32 s9, s9, s23
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s20, s20, s4
+; GFX908-NEXT:    s_add_u32 s20, s20, s16
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
-; GFX908-NEXT:    s_addc_u32 s21, s21, s5
+; GFX908-NEXT:    s_addc_u32 s21, s21, s17
 ; GFX908-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    s_add_u32 s22, s20, s9
-; GFX908-NEXT:    s_addc_u32 s23, s21, s13
+; GFX908-NEXT:    s_add_u32 s22, s20, s5
+; GFX908-NEXT:    s_addc_u32 s23, s21, s9
 ; GFX908-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
@@ -660,17 +660,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX908-NEXT:  .LBB3_10: ; %Flow19
+; GFX908-NEXT:  .LBB3_10: ; %Flow20
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  ; %bb.11: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_add_u32 s10, s10, s8
-; GFX908-NEXT:    s_addc_u32 s11, s11, 0
-; GFX908-NEXT:    s_add_u32 s14, s14, s16
-; GFX908-NEXT:    s_addc_u32 s15, s15, s17
+; GFX908-NEXT:    s_add_u32 s6, s6, s8
+; GFX908-NEXT:    s_addc_u32 s7, s7, 0
+; GFX908-NEXT:    s_add_u32 s12, s12, s14
+; GFX908-NEXT:    s_addc_u32 s13, s13, s15
 ; GFX908-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
 ; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
@@ -679,52 +679,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
 ; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX90A-NEXT:    s_load_dword s0, s[8:9], 0x18
-; GFX90A-NEXT:    s_mov_b32 s12, 0
-; GFX90A-NEXT:    s_mov_b32 s9, s12
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX90A-NEXT:    s_load_dwordx2 s[16:17], s[8:9], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
+; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_mov_b32 s9, s4
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s7
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX90A-NEXT:    s_sub_i32 s3, 0, s1
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; GFX90A-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX90A-NEXT:    s_mul_i32 s1, s1, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s2, s1
-; GFX90A-NEXT:    s_add_i32 s2, s2, s1
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s6, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s1, s7
-; GFX90A-NEXT:    s_sub_i32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_sub_i32 s6, s2, s7
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX90A-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX90A-NEXT:    s_add_i32 s3, s1, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT:    s_cselect_b32 s8, s3, s1
-; GFX90A-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX90A-NEXT:    s_mul_i32 s3, s3, s5
+; GFX90A-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX90A-NEXT:    s_add_i32 s5, s5, s3
+; GFX90A-NEXT:    s_mul_hi_u32 s3, s0, s5
+; GFX90A-NEXT:    s_mul_i32 s5, s3, s1
+; GFX90A-NEXT:    s_sub_i32 s0, s0, s5
+; GFX90A-NEXT:    s_add_i32 s8, s3, 1
+; GFX90A-NEXT:    s_sub_i32 s5, s0, s1
+; GFX90A-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX90A-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX90A-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX90A-NEXT:    s_add_i32 s5, s3, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX90A-NEXT:    s_cselect_b32 s8, s5, s3
+; GFX90A-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX90A-NEXT:    s_or_b32 s14, s14, 28
-; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
+; GFX90A-NEXT:    s_or_b32 s12, s12, 28
+; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[8:9], 5
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s2, v18
 ; GFX90A-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX90A-NEXT:    s_mul_i32 s3, s5, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s5, s4, s2
-; GFX90A-NEXT:    s_mul_i32 s2, s4, s2
+; GFX90A-NEXT:    s_mul_i32 s3, s17, s2
+; GFX90A-NEXT:    s_mul_hi_u32 s5, s16, s2
+; GFX90A-NEXT:    s_mul_i32 s2, s16, s2
 ; GFX90A-NEXT:    s_add_i32 s3, s5, s3
-; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
+; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[2:3], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
-; GFX90A-NEXT:  .LBB3_1: ; %Flow20
+; GFX90A-NEXT:  .LBB3_1: ; %Flow21
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
@@ -737,34 +737,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX90A-NEXT:    s_mov_b32 s13, s12
+; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
+; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
+; GFX90A-NEXT:    s_mov_b64 s[20:21], s[12:13]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v4
-; GFX90A-NEXT:    v_readfirstlane_b32 s13, v5
-; GFX90A-NEXT:    s_add_u32 s9, s9, 1
-; GFX90A-NEXT:    s_addc_u32 s13, s13, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s22, s6, s9
-; GFX90A-NEXT:    s_mul_i32 s13, s6, s13
-; GFX90A-NEXT:    s_mul_i32 s23, s7, s9
-; GFX90A-NEXT:    s_add_i32 s13, s22, s13
-; GFX90A-NEXT:    s_mul_i32 s9, s6, s9
-; GFX90A-NEXT:    s_add_i32 s13, s13, s23
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX90A-NEXT:    s_add_u32 s5, s5, 1
+; GFX90A-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s22, s10, s5
+; GFX90A-NEXT:    s_mul_i32 s9, s10, s9
+; GFX90A-NEXT:    s_mul_i32 s23, s11, s5
+; GFX90A-NEXT:    s_add_i32 s9, s22, s9
+; GFX90A-NEXT:    s_mul_i32 s5, s10, s5
+; GFX90A-NEXT:    s_add_i32 s9, s9, s23
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s20, s20, s4
-; GFX90A-NEXT:    s_addc_u32 s21, s21, s5
+; GFX90A-NEXT:    s_add_u32 s20, s20, s16
+; GFX90A-NEXT:    s_addc_u32 s21, s21, s17
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[22:23], 0
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
@@ -772,8 +772,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    s_add_u32 s22, s20, s9
-; GFX90A-NEXT:    s_addc_u32 s23, s21, s13
+; GFX90A-NEXT:    s_add_u32 s22, s20, s5
+; GFX90A-NEXT:    s_addc_u32 s23, s21, s9
 ; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
@@ -814,17 +814,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
-; GFX90A-NEXT:  .LBB3_10: ; %Flow19
+; GFX90A-NEXT:  .LBB3_10: ; %Flow20
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  ; %bb.11: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_add_u32 s10, s10, s8
-; GFX90A-NEXT:    s_addc_u32 s11, s11, 0
-; GFX90A-NEXT:    s_add_u32 s14, s14, s16
-; GFX90A-NEXT:    s_addc_u32 s15, s15, s17
+; GFX90A-NEXT:    s_add_u32 s6, s6, s8
+; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
+; GFX90A-NEXT:    s_add_u32 s12, s12, s14
+; GFX90A-NEXT:    s_addc_u32 s13, s13, s15
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX90A-NEXT:    s_branch .LBB3_1
 ; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/alloca.ll b/llvm/test/CodeGen/AMDGPU/alloca.ll
index 0e82bb97d5dcc..7955255efc523 100644
--- a/llvm/test/CodeGen/AMDGPU/alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/alloca.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llvm-as -data-layout=A5 < %s | llvm-dis | FileCheck %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa < %s
 ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple amdgcn-amd-amdhsa
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f9b754688c08e..80e1e67bc2ebb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 0
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 95f59479c73e8..877d8fda4c8b3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,GISEL %s
 
@@ -242,3 +243,7 @@ attributes #4 = { nounwind writeonly }
 !9 = !{!"doff_0_0_b"}
 !10 = !{}
 !11 = !{!"doff_1_0_b"}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GISEL: {{.*}}
+; SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b7097a9557b75..53b2d48d64091 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2547,44 +2547,45 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: udiv_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
@@ -2600,6 +2601,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2607,42 +2609,43 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: udiv_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s7, 16
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -2650,7 +2653,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
@@ -2659,8 +2661,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -2762,49 +2763,51 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: urem_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_lshr_b32 s5, s6, 16
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mad_f32 v1, -v1, v2, v4
-; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX6-NEXT:    s_and_b32 s5, s9, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
+; GFX6-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
-; GFX6-NEXT:    s_lshr_b32 s5, s9, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
@@ -2815,10 +2818,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s7, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2830,67 +2833,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: urem_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    s_and_b32 s5, s1, 0xffff
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX9-NEXT:    s_and_b32 s8, s3, 0xffff
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
 ; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3000,62 +3003,64 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: sdiv_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s7, s10
+; GFX6-NEXT:    s_sext_i32_i16 s6, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GFX6-NEXT:    s_xor_b32 s6, s7, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_or_b32 s8, s6, 1
+; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s8, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT:    s_ashr_i32 s6, s10, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s6, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    s_sext_i32_i16 s5, s11
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_sext_i32_i16 s6, s5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    s_sext_i32_i16 s4, s11
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s11, 16
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT:    s_ashr_i32 s4, s9, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s11, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
@@ -3080,13 +3085,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: sdiv_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s5
 ; GFX9-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -3098,61 +3103,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
-; GFX9-NEXT:    s_sext_i32_i16 s0, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v4
+; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    v_add_u32_e32 v1, s2, v5
 ; GFX9-NEXT:    s_ashr_i32 s2, s3, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
-; GFX9-NEXT:    s_ashr_i32 s0, s1, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s4, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v6
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3270,53 +3275,55 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX6-LABEL: srem_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s0, s8
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX6-NEXT:    s_sext_i32_i16 s1, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s1
+; GFX6-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_or_b32 s10, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s11
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
+; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s9
+; GFX6-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v1
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
@@ -3330,30 +3337,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT:    s_ashr_i32 s4, s11, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s9, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s9, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s7, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s9, 16
-; GFX6-NEXT:    s_lshr_b32 s7, s11, 16
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s8, s9, 16
+; GFX6-NEXT:    s_or_b32 s9, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s9, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v5
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s8
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s7, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -3362,13 +3369,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ;
 ; GFX9-LABEL: srem_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
+; GFX9-NEXT:    s_sext_i32_i16 s9, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX9-NEXT:    s_xor_b32 s4, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -3380,69 +3387,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s2, s10, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v1, s4, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX9-NEXT:    s_sext_i32_i16 s8, s1
+; GFX9-NEXT:    s_xor_b32 s4, s10, s2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX9-NEXT:    s_or_b32 s6, s4, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v4
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX9-NEXT:    s_sext_i32_i16 s6, s7
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s6
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    s_sext_i32_i16 s8, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s8
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX9-NEXT:    s_xor_b32 s0, s8, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT:    s_xor_b32 s2, s8, s6
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s1, 16
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 16
+; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    s_xor_b32 s0, s2, s3
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s4, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s5, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s3
+; GFX9-NEXT:    s_or_b32 s6, s2, 1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v4|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v6
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
@@ -3838,46 +3845,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: udiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s6, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -3885,48 +3894,47 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
-; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
 ; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v6, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v6, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v6, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4006,52 +4014,54 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: urem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
-; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    s_lshr_b32 s6, s10, 16
+; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX6-NEXT:    s_and_b32 s1, s10, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v4
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
-; GFX6-NEXT:    s_and_b32 s6, s9, 0xffff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
+; GFX6-NEXT:    v_mad_f32 v2, -v1, v4, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_mad_f32 v2, -v2, v3, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -4059,33 +4069,34 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s5, s6, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffff
+; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4094,18 +4105,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s5
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4191,46 +4201,47 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: sdiv_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s7, s10
+; GFX6-NEXT:    s_sext_i32_i16 s6, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GFX6-NEXT:    s_xor_b32 s6, s7, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_or_b32 s8, s6, 1
+; GFX6-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s8, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s6, v2
+; GFX6-NEXT:    s_ashr_i32 s6, s10, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_xor_b32 s4, s6, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    s_sext_i32_i16 s5, s11
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s9
+; GFX6-NEXT:    s_sext_i32_i16 s4, s11
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
@@ -4243,6 +4254,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -4253,13 +4265,13 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: sdiv_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s5
 ; GFX9-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
@@ -4271,44 +4283,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    v_add_u32_e32 v2, s4, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    s_or_b32 s0, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
-; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
-; GFX9-NEXT:    s_sext_i32_i16 s0, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v4
+; GFX9-NEXT:    s_sext_i32_i16 s2, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s4, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v4|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
+; GFX9-NEXT:    global_store_short v1, v0, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -4400,68 +4412,70 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX6-LABEL: srem_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sext_i32_i16 s4, s10
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s8
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_sext_i32_i16 s0, s8
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX6-NEXT:    s_sext_i32_i16 s1, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s1
+; GFX6-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_or_b32 s10, s0, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
-; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT:    s_cselect_b32 s0, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s6, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
-; GFX6-NEXT:    s_or_b32 s8, s4, 1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT:    s_sext_i32_i16 s4, s11
+; GFX6-NEXT:    s_sext_i32_i16 s4, s9
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    s_sext_i32_i16 s5, s9
+; GFX6-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    s_or_b32 s7, s4, 1
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s9
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s10, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -4470,12 +4484,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ;
 ; GFX9-LABEL: srem_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
+; GFX9-NEXT:    s_sext_i32_i16 s8, s6
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
+; GFX9-NEXT:    s_sext_i32_i16 s9, s2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX9-NEXT:    s_xor_b32 s4, s9, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
@@ -4487,51 +4501,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s2, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s2, s10, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v1, s4, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    s_xor_b32 s4, s10, s2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX9-NEXT:    s_or_b32 s6, s4, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
-; GFX9-NEXT:    s_sext_i32_i16 s2, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
-; GFX9-NEXT:    s_sext_i32_i16 s3, s1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s3
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v3
+; GFX9-NEXT:    s_sext_i32_i16 s4, s7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX9-NEXT:    s_sext_i32_i16 s5, s3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT:    s_xor_b32 s0, s3, s2
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX9-NEXT:    s_or_b32 s4, s0, 1
+; GFX9-NEXT:    s_xor_b32 s2, s5, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s6, s2, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
-; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
-; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
@@ -5665,29 +5679,31 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT:    s_sub_i32 s1, 0, s0
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s11
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s0
+; GFX6-NEXT:    s_mov_b32 s8, s4
+; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX6-NEXT:    s_mul_i32 s1, s1, s0
-; GFX6-NEXT:    s_sub_i32 s1, s8, s1
+; GFX6-NEXT:    s_sub_i32 s1, s6, s1
 ; GFX6-NEXT:    s_sub_i32 s3, s1, s0
 ; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -5702,10 +5718,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX6-NEXT:    s_mul_i32 s0, s0, s2
-; GFX6-NEXT:    s_sub_i32 s0, s9, s0
+; GFX6-NEXT:    s_sub_i32 s0, s7, s0
 ; GFX6-NEXT:    s_sub_i32 s1, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
 ; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
@@ -5716,19 +5732,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -5740,37 +5756,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s5, s4, s7
-; GFX9-NEXT:    s_sub_i32 s0, s0, s5
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX9-NEXT:    s_add_i32 s9, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_sub_i32 s5, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    s_cselect_b32 s0, s5, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s6
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s5, s4, s6
-; GFX9-NEXT:    s_sub_i32 s1, s1, s5
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
 ; GFX9-NEXT:    s_add_i32 s7, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX9-NEXT:    s_sub_i32 s5, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
 ; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -6006,64 +6022,67 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_sub_i32 s0, 0, s6
+; GFX6-NEXT:    s_lshl_b32 s8, 0x1000, s1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s6, s6, s2
-; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_mul_i32 s4, s4, s6
+; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s4, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, 0, s8
+; GFX6-NEXT:    v_mul_lo_u32 v0, s4, v1
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s8
+; GFX6-NEXT:    s_sub_i32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
@@ -6075,33 +6094,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s7
-; GFX9-NEXT:    s_sub_i32 s0, s0, s4
-; GFX9-NEXT:    s_sub_i32 s4, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
-; GFX9-NEXT:    s_sub_i32 s4, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s4, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s7
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s6
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s6
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s4, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX9-NEXT:    s_sub_i32 s4, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s4, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_sub_i32 s4, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
@@ -6483,136 +6502,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    s_abs_i32 s6, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_sub_i32 s7, 0, s6
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
+; GFX6-NEXT:    s_abs_i32 s8, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    s_lshl_b32 s9, 0x1000, s7
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v0
-; GFX6-NEXT:    s_abs_i32 s7, s0
-; GFX6-NEXT:    s_xor_b32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_abs_i32 s4, s2
+; GFX6-NEXT:    s_xor_b32 s2, s2, s6
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s6
-; GFX6-NEXT:    s_sub_i32 s2, s7, s2
-; GFX6-NEXT:    s_sub_i32 s7, s2, s6
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX6-NEXT:    s_mul_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s5, s4, s8
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_abs_i32 s2, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
+; GFX6-NEXT:    s_abs_i32 s8, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_xor_b32 s3, s1, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_abs_i32 s1, s3
+; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v2
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GFX6-NEXT:    s_xor_b32 s0, s3, s9
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX6-NEXT:    s_mul_i32 s0, s0, s2
-; GFX6-NEXT:    s_sub_i32 s0, s1, s0
-; GFX6-NEXT:    s_sub_i32 s1, s0, s2
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX6-NEXT:    s_mul_i32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, s1, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s3, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX9-NEXT:    s_abs_i32 s6, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s3
-; GFX9-NEXT:    s_abs_i32 s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT:    s_abs_i32 s8, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX9-NEXT:    s_abs_i32 s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s2, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX9-NEXT:    s_sub_i32 s6, 0, s8
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s2, s2, s8
-; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s2
-; GFX9-NEXT:    s_add_i32 s8, s8, s2
-; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s8
-; GFX9-NEXT:    s_mul_i32 s8, s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, s3, s8
-; GFX9-NEXT:    s_add_i32 s9, s2, 1
-; GFX9-NEXT:    s_sub_i32 s8, s3, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s2, s9, s2
-; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
-; GFX9-NEXT:    s_add_i32 s8, s2, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s8, s2
-; GFX9-NEXT:    s_abs_i32 s8, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_xor_b32 s5, s6, s0
-; GFX9-NEXT:    s_sub_i32 s6, 0, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT:    s_mul_i32 s7, s6, s8
+; GFX9-NEXT:    s_sub_i32 s5, s5, s7
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
+; GFX9-NEXT:    s_sub_i32 s7, s5, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s6
+; GFX9-NEXT:    s_abs_i32 s6, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_xor_b32 s5, s5, s2
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
+; GFX9-NEXT:    s_sub_i32 s2, s5, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s0, s5, s0
-; GFX9-NEXT:    s_xor_b32 s4, s1, s7
-; GFX9-NEXT:    s_abs_i32 s1, s1
+; GFX9-NEXT:    s_xor_b32 s4, s3, s4
+; GFX9-NEXT:    s_abs_i32 s3, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
-; GFX9-NEXT:    s_add_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s5
-; GFX9-NEXT:    s_mul_i32 s6, s5, s8
-; GFX9-NEXT:    s_sub_i32 s1, s1, s6
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s7
+; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_sub_i32 s7, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s1, s8
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
-; GFX9-NEXT:    s_cselect_b32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
-; GFX9-NEXT:    s_cselect_b32 s1, s6, s5
-; GFX9-NEXT:    s_xor_b32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -6935,122 +6956,125 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX6-NEXT:    s_abs_i32 s2, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s6
+; GFX6-NEXT:    s_abs_i32 s6, s0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_sub_i32 s0, 0, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX6-NEXT:    s_abs_i32 s6, s0
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_lshl_b32 s5, 0x1000, s7
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_abs_i32 s4, s2
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 31
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_mul_i32 s7, s7, s2
-; GFX6-NEXT:    s_sub_i32 s6, s6, s7
-; GFX6-NEXT:    s_sub_i32 s7, s6, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_sub_i32 s7, s6, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s6
-; GFX6-NEXT:    s_abs_i32 s3, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s6, 0, s3
-; GFX6-NEXT:    s_abs_i32 s8, s1
-; GFX6-NEXT:    s_xor_b32 s2, s2, s0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s6
+; GFX6-NEXT:    s_sub_i32 s4, s4, s7
+; GFX6-NEXT:    s_sub_i32 s7, s4, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s7, s4, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX6-NEXT:    s_cselect_b32 s8, s7, s4
+; GFX6-NEXT:    s_abs_i32 s9, s5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_sub_i32 s4, 0, s9
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_ashr_i32 s1, s3, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s0, s2, s0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_abs_i32 s0, s3
+; GFX6-NEXT:    s_xor_b32 s3, s8, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_sub_i32 s2, s3, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s2, s8, s2
-; GFX6-NEXT:    s_sub_i32 s8, s2, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
-; GFX6-NEXT:    s_sub_i32 s8, s2, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
-; GFX6-NEXT:    s_xor_b32 s2, s2, s1
-; GFX6-NEXT:    s_sub_i32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX6-NEXT:    s_mul_i32 s3, s3, s9
+; GFX6-NEXT:    s_sub_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s3, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s3, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX6-NEXT:    s_xor_b32 s0, s0, s1
+; GFX6-NEXT:    s_sub_i32 s0, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
-; GFX9-NEXT:    s_abs_i32 s2, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_sub_i32 s7, 0, s2
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
-; GFX9-NEXT:    s_abs_i32 s0, s0
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
+; GFX9-NEXT:    s_abs_i32 s6, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
 ; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s8
-; GFX9-NEXT:    s_mul_i32 s7, s7, s2
-; GFX9-NEXT:    s_sub_i32 s0, s0, s7
-; GFX9-NEXT:    s_sub_i32 s7, s0, s2
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
-; GFX9-NEXT:    s_sub_i32 s7, s0, s2
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
-; GFX9-NEXT:    s_abs_i32 s7, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_xor_b32 s0, s0, s6
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_i32 s5, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s2, s8
+; GFX9-NEXT:    s_mul_i32 s7, s7, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, s7
+; GFX9-NEXT:    s_sub_i32 s7, s2, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    s_sub_i32 s7, s2, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    s_abs_i32 s4, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s7, 0, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s0, s0, s6
-; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_abs_i32 s1, s1
+; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX9-NEXT:    s_abs_i32 s3, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_sub_i32 s1, s1, s5
-; GFX9-NEXT:    s_sub_i32 s5, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
-; GFX9-NEXT:    s_sub_i32 s5, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
-; GFX9-NEXT:    s_xor_b32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s5, s5, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_xor_b32 s3, s3, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
@@ -10099,6 +10123,15 @@ define <2 x i64> @srem_zero_zero() {
 ; GCN-LABEL: kernel:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
+; GFX6-LABEL: srem_zero_zero:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: srem_zero_zero:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %B = srem <2 x i64> zeroinitializer, zeroinitializer
   ret <2 x i64> %B
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index c6233642110ea..39765120941f7 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -210,7 +210,7 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i
 }
 
 ; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64:
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
 ; SI-NOT: and
@@ -222,13 +222,12 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
 }
 
 ; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
-; SI: s_load_dwordx2
-; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_load_dword [[B:s[0-9]+]]
+; SI: s_load_dwordx4
+; SI: s_load_dword
 ; SI: s_load_dwordx2
 ; SI-NOT: and
-; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
-; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A:s[0-9]+]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B:s[0-9]+]], 1
 ; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
 ; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
 ; SI-NOT: and
@@ -370,13 +369,13 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
-; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
+; SI: s_load_dwordx4
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A:s[0-9]+]], 1{{$}}
 ; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[B:s[0-9]+]], 64
 ; SI-NOT: and
 ; SI: s_add_u32
-; SI-NEXT: s_addc_u32
+; SI-NEXT: s_mov_b32
 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) {
   %shl = shl i64 %a, 1
   %and = and i64 %shl, 64
@@ -386,7 +385,7 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
 ; SI-NOT: and
@@ -400,8 +399,7 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
 ; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
 ; SI-NOT: and
@@ -415,8 +413,7 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a
 ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
 ; SI-NOT: and
@@ -430,8 +427,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p
 ; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
 ; SI-NOT: and
@@ -445,8 +441,7 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a
 ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
 ; SI-NOT: and
@@ -458,8 +453,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
 ; SI-NOT: and
@@ -471,8 +465,7 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
 ; SI-NOT: and
@@ -486,8 +479,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p
 ; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
 ; SI-NOT: and
@@ -501,8 +493,7 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a
 ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
 
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
 ; SI-NOT: and
@@ -518,8 +509,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p
 ; low 32-bits, which is not a valid 64-bit inline immmediate.
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
-; SI: s_load_dword s
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
@@ -531,8 +521,7 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
@@ -544,8 +533,7 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou
 }
 
 ; Shift into upper 32-bits
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
@@ -557,8 +545,7 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o
 }
 
 ; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
-; SI: s_load_dword
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI-NOT: and
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index b50112fe24879..d772a114d7b28 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -1,24 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
 
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; SI-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[4:5], 0x13
+; SI-NEXT:    s_load_dword s3, s[4:5], 0x1c
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_mov_b32_e32 v0, 7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v2, s3
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x4c
+; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x70
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
   %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -26,24 +47,42 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: [[RESULT]]
-; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; SI-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-NEXT:    v_mov_b32_e32 v0, 7
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11-NEXT:    ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
   %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
@@ -51,13 +90,48 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
-; GFX9PLUS-NOT: m0
-; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+; SI-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xc
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s3, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_sub_i32 s1, s1, s2
+; SI-NEXT:    s_lshl_b32 s1, s1, 2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_i32 s1, s3, s1
+; SI-NEXT:    s_add_i32 s1, s1, 16
+; SI-NEXT:    v_mov_b32_e32 v0, 7
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_cmpst_rtn_b32 v0, v2, v0, v1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
@@ -67,47 +141,80 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; SI-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
+; SI-NEXT:    s_load_dword s1, s[4:5], 0x12
+; SI-NEXT:    v_mov_b32_e32 v0, 7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_cmpst_b32 v1, v0, v2 offset:16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x48
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    ds_cmpstore_b32 v1, v2, v0 offset:16
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
   %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; SI-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
+; SI-NEXT:    v_mov_b32_e32 v0, 7
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    ds_cmpstore_b64 v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
   %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CIVI: {{.*}}
+; GCN: {{.*}}
+; GFX8PLUS: {{.*}}
+; GFX9PLUS: {{.*}}
+; PREGFX11: {{.*}}
+; SICI: {{.*}}
+; SICIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index b03ade4b527e6..75b06acecf32a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1295,10 +1295,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cmp_ge_i32 s2, s3
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GCN-NEXT:  .LBB10_4: ; %Flow5
+; GCN-NEXT:  .LBB10_4: ; %Flow7
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_vccz .LBB10_5
-; GCN-NEXT:  ; %bb.10: ; %Flow5
+; GCN-NEXT:  ; %bb.10: ; %Flow7
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
 ; GCN-NEXT:  .Lpost_getpc13:
 ; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295
@@ -1316,15 +1316,15 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:  .LBB10_6:
 ; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: long_branch_hang:
@@ -1363,7 +1363,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:  .LBB10_4: ; %Flow5
+; GFX11-NEXT:  .LBB10_4: ; %Flow7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[8:9]
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB10_6
@@ -1381,15 +1381,14 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:  .LBB10_6:
 ; GFX11-NEXT:    ; implicit-def: $vgpr0
 ; GFX11-NEXT:  .LBB10_7: ; %bb19
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1424,10 +1423,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_2: ; %Flow5
+; GFX12-NEXT:  .LBB10_2: ; %Flow7
 ; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX12-NEXT:    s_cbranch_vccz .LBB10_3
-; GFX12-NEXT:  ; %bb.12: ; %Flow5
+; GFX12-NEXT:  ; %bb.12: ; %Flow7
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
 ; GFX12-NEXT:  .Lpost_getpc14:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1490,14 +1489,12 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:  .LBB10_6:
 ; GFX12-NEXT:    ; implicit-def: $vgpr0
 ; GFX12-NEXT:  .LBB10_7: ; %bb19
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
-; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eaeff8eb1..a5bd004db9ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 12b26cb6d8c19..ef4780be197f0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -186,30 +186,32 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %ou
 ;
 ; GCN-LABEL: dynamic_extract_vector_elt_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x15
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x15
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s2, 4
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s2
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_lshl_b32 s4, s6, 4
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX89-LABEL: dynamic_extract_vector_elt_v3i16:
 ; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX89-NEXT:    s_load_dword s8, s[4:5], 0x54
-; GFX89-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_lshl_b32 s4, s8, 4
-; GFX89-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
-; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_lshl_b32 s0, s8, 4
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 71b1a16c79e69..8c474aae3985c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -218,21 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s4, s3, 0xffff0000
-; CI-NEXT:    s_lshl_b32 s3, s3, 16
-; CI-NEXT:    s_and_b32 s5, s2, 0xffff0000
-; CI-NEXT:    v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT:    v_mul_f32_e64 v1, 1.0, |s3|
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s5|
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT:    s_lshl_b32 s2, s2, 16
-; CI-NEXT:    v_alignbit_b32 v1, v0, v1, 16
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fabs_v4bf16:
@@ -242,23 +234,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s3, 0x7fff
-; VI-NEXT:    s_lshr_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s5, s2, 0x7fff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0x7fff
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
-; VI-NEXT:    s_and_b32 s4, 0xffff, s4
-; VI-NEXT:    s_and_b32 s5, 0xffff, s5
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s3, s4, s3
-; VI-NEXT:    s_or_b32 s2, s5, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_fabs_v4bf16:
@@ -266,55 +248,24 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s3, 0x7fff
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NEXT:    s_and_b32 s5, s2, 0x7fff
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
+; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-TRUE16-LABEL: s_fabs_v4bf16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s3, 0x7fff
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s2
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s3, 0x7fff
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_fabs_v4bf16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s3, 0x7fff
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0x7fff
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_fabs_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %fabs = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %in)
   store <4 x bfloat> %fabs, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7b6a363c42708..c4b51b5380ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fabs_v4f16:
@@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c91..baf9b0abf7b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset0_b32 s3, 31
-; VI-NEXT:    s_bitset0_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fffffff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 61f5b73033f5e..a00afb6fcf6ef 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -472,50 +472,52 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
 define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
 ; SI-LABEL: s_test_copysign_v2f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT:    s_brev_b32 s8, -2
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_bfi_b32 v1, s8, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_bfi_b32 v0, s8, v0, v2
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_bfi_b32 v1, s0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_bfi_b32 v0, s0, v0, v2
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_brev_b32 s6, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_bfi_b32 v1, s6, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_bfi_b32 v0, s6, v2, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_bfi_b32 v3, s6, v2, v3
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_bfi_b32 v2, s6, v2, v4
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_v2f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, ptr addrspace(1) %out, align 8
@@ -877,46 +879,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
 define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) {
 ; SI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_brev_b32 s4, -2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s0, -2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b826e6c469d8e..72e3549656327 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -938,16 +938,18 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32:
 ; GFX6-FASTFMA:       ; %bb.0: ; %entry
-; GFX6-FASTFMA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-FASTFMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GFX6-FASTFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-FASTFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, -1
 ; GFX6-FASTFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s4, s0
+; GFX6-FASTFMA-NEXT:    s_mov_b32 s5, s1
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[0:1], s9, s9, v1
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s11
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s3, v0, s3
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -956,13 +958,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v0, -v2, v4, v0
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[0:1], s8, s8, v2
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v1, v0, s11, v1
-; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v1, v0, s9, v1
+; GFX6-FASTFMA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v0, vcc, s2, v0, s2
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
@@ -972,20 +974,21 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v0, -v3, v5, v0
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v0, v0, v4, v5
-; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v0, s10, v2
-; GFX6-FASTFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v0, s8, v2
+; GFX6-FASTFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-FASTFMA-NEXT:    s_endpgm
 ;
 ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32:
 ; GFX6-SLOWFMA:       ; %bb.0: ; %entry
-; GFX6-SLOWFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-SLOWFMA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX6-SLOWFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX6-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v3, 1.0
@@ -995,14 +998,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v5, v2
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[6:7], s4, s4, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v5
-; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v3, s2
-; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX6-SLOWFMA-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, -1
+; GFX6-SLOWFMA-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, s2, v3, s2
+; GFX6-SLOWFMA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v2
-; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v1, s3, v0
+; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v1, s5, v0
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v0, -v2, v5, 1.0
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v0, v0, v5, v5
@@ -1012,22 +1014,24 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v0, v2, v0, v5
-; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v0, s2, v4
-; GFX6-SLOWFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v0, s4, v4
+; GFX6-SLOWFMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-SLOWFMA-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: s_fdiv_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_mov_b32 s4, s0
+; GFX7-NEXT:    s_mov_b32 s5, s1
+; GFX7-NEXT:    v_div_scale_f32 v2, s[0:1], s9, s9, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s11
-; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s9
+; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s3, v0, s3
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -1036,13 +1040,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX7-NEXT:    v_fma_f32 v0, -v2, v4, v0
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX7-NEXT:    v_div_scale_f32 v3, s[0:1], s8, s8, v2
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX7-NEXT:    v_div_fixup_f32 v1, v0, s11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX7-NEXT:    v_div_fixup_f32 v1, v0, s9, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_div_scale_f32 v0, vcc, s2, v0, s2
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v4, v4
@@ -1052,19 +1056,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX7-NEXT:    v_fma_f32 v0, -v3, v5, v0
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v4, v5
-; GFX7-NEXT:    v_div_fixup_f32 v0, v0, s10, v2
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    v_div_fixup_f32 v0, v0, s8, v2
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v5, -v1, v3, 1.0
@@ -1074,13 +1079,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX8-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v5, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX8-NEXT:    v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX8-NEXT:    v_div_scale_f32 v2, s[6:7], s4, s4, v4
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_div_scale_f32 v3, vcc, s2, v3, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v2
-; GFX8-NEXT:    v_div_fixup_f32 v1, v1, s3, v0
+; GFX8-NEXT:    v_div_fixup_f32 v1, v1, s5, v0
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v0, -v2, v5, 1.0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v5, v5
@@ -1090,19 +1094,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_div_fixup_f32 v0, v0, s2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_div_fixup_f32 v0, v0, s4, v4
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s6, s3, s3, s1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s7, s7, s3
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s3, s7, s3
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -1112,12 +1117,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_scale_f32 v2, s6, s2, s2, s0
+; GFX10-NEXT:    v_div_scale_f32 v2, s4, s6, s6, s2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX10-NEXT:    v_div_fixup_f32 v1, v0, s3, s1
-; GFX10-NEXT:    v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX10-NEXT:    v_div_fixup_f32 v1, v0, s7, s3
+; GFX10-NEXT:    v_div_scale_f32 v0, vcc_lo, s2, s6, s2
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
@@ -1128,19 +1132,18 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s2, s0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s6, s2
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s1
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s5, s5, s3
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s3, s5, s3
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -1151,11 +1154,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX11-NEXT:    s_denorm_mode 12
-; GFX11-NEXT:    v_div_scale_f32 v2, null, s2, s2, s0
+; GFX11-NEXT:    v_div_scale_f32 v2, null, s4, s4, s2
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX11-NEXT:    v_div_fixup_f32 v1, v0, s3, s1
-; GFX11-NEXT:    v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX11-NEXT:    v_div_fixup_f32 v1, v0, s5, s3
+; GFX11-NEXT:    v_div_scale_f32 v0, vcc_lo, s2, s4, s2
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -1167,8 +1170,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s2, s0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s4, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32:
@@ -1193,58 +1196,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX67-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v1
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s8
+; GFX67-NEXT:    v_rcp_f32_e32 v1, s9
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX8-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v1
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX8-NEXT:    v_rcp_f32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v3
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX10-NEXT:    v_rcp_f32_e32 v1, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s6
+; GFX10-NEXT:    v_rcp_f32_e32 v1, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_f32_e32 v1, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s6
+; GFX11-NEXT:    v_rcp_f32_e32 v1, s7
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1269,58 +1274,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX67-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s9
+; GFX67-NEXT:    v_rcp_f32_e32 v2, s8
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v2
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s7
+; GFX8-NEXT:    v_rcp_f32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX11-NEXT:    v_rcp_f32_e32 v2, s2
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v2, s6
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1345,58 +1352,60 @@ entry:
 define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
 ; GFX67-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX67-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX67-NEXT:    s_mov_b32 s6, -1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX67-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX67-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT:    v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT:    v_rcp_f32_e32 v0, s9
+; GFX67-NEXT:    v_rcp_f32_e32 v2, s8
+; GFX67-NEXT:    s_mov_b32 s4, s0
+; GFX67-NEXT:    s_mov_b32 s5, s1
+; GFX67-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT:    v_mul_f32_e32 v0, s2, v2
 ; GFX67-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX67-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX8-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_rcp_f32_e32 v2, s7
+; GFX8-NEXT:    v_rcp_f32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_rcp_f32_e32 v2, s2
-; GFX10-NEXT:    v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_rcp_f32_e32 v2, s6
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_v2f32_arcp_math:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX11-NEXT:    v_rcp_f32_e32 v2, s2
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v2, s6
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_arcp_math:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 3304dbf3eaa3d..a98bf785b617f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -279,13 +279,12 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -298,13 +297,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_add_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -317,13 +315,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_add_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -334,17 +331,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_add_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -361,7 +357,7 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -383,7 +379,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -405,7 +401,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -426,7 +422,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -583,13 +579,12 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -600,13 +595,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_add_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -617,13 +611,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_add_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -634,17 +627,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_add_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -660,7 +652,7 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -680,7 +672,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_add_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -700,7 +692,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_add_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -721,7 +713,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_add_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -887,13 +879,12 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -906,13 +897,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_and_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -925,13 +915,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_and_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -942,17 +931,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_and_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -969,7 +957,7 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -991,7 +979,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1013,7 +1001,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1034,7 +1022,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1191,13 +1179,12 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1208,13 +1195,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_and_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1225,13 +1211,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_and_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1242,17 +1227,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_and_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1268,7 +1252,7 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1288,7 +1272,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_and_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1308,7 +1292,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_and_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1329,7 +1313,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_and_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1495,13 +1479,12 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -1514,13 +1497,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -1533,13 +1515,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1550,17 +1531,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_sub_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1577,7 +1557,7 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1599,7 +1579,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1621,7 +1601,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1642,7 +1622,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1799,13 +1779,12 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1816,13 +1795,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_sub_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1833,13 +1811,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_sub_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1850,17 +1827,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_sub_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1876,7 +1852,7 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1896,7 +1872,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1916,7 +1892,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1937,7 +1913,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2098,13 +2074,12 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -2116,13 +2091,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2134,13 +2108,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2150,17 +2123,16 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_i32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2176,7 +2148,7 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2198,7 +2170,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2220,7 +2192,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2241,7 +2213,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2392,13 +2364,12 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2408,13 +2379,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2424,13 +2394,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2440,17 +2409,16 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_i32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2465,7 +2433,7 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2485,7 +2453,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_max_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2505,7 +2473,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_max_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2526,7 +2494,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_max_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2686,13 +2654,12 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -2704,13 +2671,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2722,13 +2688,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2738,17 +2703,16 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2764,7 +2728,7 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2786,7 +2750,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2808,7 +2772,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2829,7 +2793,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2980,13 +2944,12 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2996,13 +2959,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -3012,13 +2974,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3028,17 +2989,16 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_max_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3053,7 +3013,7 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3073,7 +3033,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3093,7 +3053,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3114,7 +3074,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3274,13 +3234,12 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -3292,13 +3251,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3310,13 +3268,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3326,17 +3283,16 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_i32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3352,7 +3308,7 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3374,7 +3330,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3396,7 +3352,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3417,7 +3373,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3568,13 +3524,12 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -3584,13 +3539,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -3600,13 +3554,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3616,17 +3569,16 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_i32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3641,7 +3593,7 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3661,7 +3613,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_min_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3681,7 +3633,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_min_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3702,7 +3654,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_min_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3862,13 +3814,12 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -3880,13 +3831,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3898,13 +3848,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3914,17 +3863,16 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3940,7 +3888,7 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3962,7 +3910,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3984,7 +3932,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4005,7 +3953,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4156,13 +4104,12 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -4172,13 +4119,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_umin_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -4188,13 +4134,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_umin_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4204,17 +4149,16 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_min_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4229,7 +4173,7 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4249,7 +4193,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4269,7 +4213,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4290,7 +4234,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4455,13 +4399,12 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -4474,13 +4417,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GCN2-LABEL: atomic_or_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -4493,13 +4435,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GCN3-LABEL: atomic_or_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4510,17 +4451,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_or_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4537,7 +4477,7 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4559,7 +4499,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4581,7 +4521,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4602,7 +4542,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4759,13 +4699,12 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -4776,13 +4715,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GCN2-LABEL: atomic_or_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -4793,13 +4731,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GCN3-LABEL: atomic_or_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4810,17 +4747,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_or_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4836,7 +4772,7 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4856,7 +4792,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ;
 ; GCN2-LABEL: atomic_or_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4876,7 +4812,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ;
 ; GCN3-LABEL: atomic_or_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4897,7 +4833,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ; GFX11-LABEL: atomic_or_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5127,13 +5063,12 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -5146,13 +5081,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -5165,13 +5099,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5182,17 +5115,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5209,7 +5141,7 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5231,7 +5163,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5253,7 +5185,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5274,7 +5206,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5431,13 +5363,12 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -5448,13 +5379,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_xchg_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -5465,13 +5395,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_xchg_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5482,17 +5411,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5508,7 +5436,7 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5528,7 +5456,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5548,7 +5476,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ;
 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5569,7 +5497,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5739,18 +5667,17 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5760,18 +5687,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5781,16 +5707,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1] offset:16
@@ -5800,17 +5725,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5828,7 +5752,7 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
@@ -5852,7 +5776,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5876,7 +5800,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5899,7 +5823,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
@@ -6060,16 +5984,15 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
+; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6079,16 +6002,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
+; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6098,16 +6020,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s1
+; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -6117,17 +6038,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_cmpswap_b32 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6144,7 +6064,7 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
@@ -6166,7 +6086,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -6188,7 +6108,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -6211,7 +6131,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
@@ -6379,13 +6299,12 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -6398,13 +6317,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -6417,13 +6335,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6434,17 +6351,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_xor_b32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -6461,7 +6377,7 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6483,7 +6399,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6505,7 +6421,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6526,7 +6442,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6683,13 +6599,12 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -6700,13 +6615,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_xor_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -6717,13 +6631,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_xor_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6734,17 +6647,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_xor_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -6760,7 +6672,7 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6780,7 +6692,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6800,7 +6712,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6821,7 +6733,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9340,13 +9252,12 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -9359,13 +9270,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -9378,13 +9288,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_inc_i32_incr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9395,17 +9304,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_inc_i32_incr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9422,7 +9330,7 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9444,7 +9352,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9466,7 +9374,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9487,7 +9395,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_inc_i32_ret_incr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9644,13 +9552,12 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_incr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -9661,13 +9568,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_inc_i32_incr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -9678,13 +9584,12 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_inc_i32_incr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9695,17 +9600,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_inc_i32_incr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9721,7 +9625,7 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9741,7 +9645,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9761,7 +9665,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_inc_i32_ret_incr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9782,7 +9686,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_inc_i32_ret_incr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10080,13 +9984,12 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    s_add_u32 s0, s0, 16
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
@@ -10099,13 +10002,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN2-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -10118,13 +10020,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GCN3-LABEL: atomic_dec_i32_decr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10135,17 +10036,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 ;
 ; GFX11-LABEL: atomic_dec_i32_decr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -10162,7 +10062,7 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10184,7 +10084,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10206,7 +10106,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10227,7 +10127,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX11-LABEL: atomic_dec_i32_ret_decr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10384,13 +10284,12 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_decr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN1-NEXT:    s_add_u32 s0, s2, s0
-; GCN1-NEXT:    s_addc_u32 s1, s3, s1
+; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN1-NEXT:    s_add_u32 s0, s0, s2
+; GCN1-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -10401,13 +10300,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN2-LABEL: atomic_dec_i32_decr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s2, s0
-; GCN2-NEXT:    s_addc_u32 s1, s3, s1
+; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s2
+; GCN2-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -10418,13 +10316,12 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GCN3-LABEL: atomic_dec_i32_decr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s2, s0
-; GCN3-NEXT:    s_addc_u32 s1, s3, s1
+; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s2
+; GCN3-NEXT:    s_addc_u32 s1, s1, s3
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10435,17 +10332,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ;
 ; GFX11-LABEL: atomic_dec_i32_decr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -10461,7 +10357,7 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10481,7 +10377,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN2-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10501,7 +10397,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ;
 ; GCN3-LABEL: atomic_dec_i32_ret_decr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10522,7 +10418,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-LABEL: atomic_dec_i32_ret_decr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index e9fd6119d0c36..6fb21b5c31b5b 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -135,11 +135,11 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_rndne_f32_e32 v1, s3
-; VI-NEXT:    v_rndne_f32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_rndne_f32_e32 v3, s3
+; VI-NEXT:    v_rndne_f32_e32 v2, s2
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 86e73ed03f187..574749f3536a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4160,42 +4160,40 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
 define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
 ; SI-LABEL: s_fneg_select_infloop_regression_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
-; SI-NEXT:    v_bfrev_b32_e32 v0, 1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-NEXT:    v_bfrev_b32_e32 v3, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bitcmp1_b32 s6, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_bitcmp1_b32 s4, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; SI-NEXT:    v_cndmask_b32_e64 v2, -v2, v3, s[2:3]
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v3, s3
-; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v3, v2, 0, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
-; VI-NEXT:    v_bfrev_b32_e32 v0, 1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
+; VI-NEXT:    v_bfrev_b32_e32 v3, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s6, 0
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_bitcmp1_b32 s4, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; VI-NEXT:    v_cndmask_b32_e64 v2, -v2, v3, s[2:3]
 ; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v3, v2, 0, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, double 0.0, double %arg
   %i2 = fneg double %i
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index d189b6d4c1e83..c5d4f098e6e7e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -675,29 +675,23 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s4, s2, 16
-; CI-NEXT:    s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT:    v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT:    s_and_b32 s2, s3, 0xffff0000
-; CI-NEXT:    s_lshl_b32 s5, s3, 16
-; CI-NEXT:    v_mul_f32_e64 v3, 1.0, |s2|
-; CI-NEXT:    v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT:    v_mul_f32_e64 v1, 1.0, |s5|
-; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s1, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff0000
+; CI-NEXT:    v_mul_f32_e64 v2, -1.0, s3
+; CI-NEXT:    s_lshl_b32 s1, s1, 16
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_and_b32 s0, s2, 0x7fff
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
-; CI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mul_f32_e64 v3, -1.0, s1
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff0000
+; CI-NEXT:    v_alignbit_b32 v3, v2, v3, 16
+; CI-NEXT:    v_mul_f32_e64 v2, -1.0, s1
+; CI-NEXT:    s_lshl_b32 s0, s0, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_mul_f32_e64 v4, -1.0, s0
+; CI-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fneg_fabs_v4bf16:
@@ -707,23 +701,25 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_bitset1_b32 s3, 15
-; VI-NEXT:    s_bitset1_b32 s2, 15
-; VI-NEXT:    s_bitset1_b32 s5, 15
-; VI-NEXT:    s_bitset1_b32 s4, 15
-; VI-NEXT:    s_and_b32 s3, 0xffff, s3
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_and_b32 s2, 0xffff, s2
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s3, s3, s5
-; VI-NEXT:    s_or_b32 s2, s2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s2, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s3, 0x7fff7fff
+; VI-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; VI-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; VI-NEXT:    s_xor_b32 s1, s1, 0x8000
+; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
+; VI-NEXT:    s_xor_b32 s0, s0, 0x8000
+; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
+; VI-NEXT:    s_and_b32 s1, 0xffff, s1
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s0, 0xffff, s0
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_or_b32 s1, s1, s3
+; VI-NEXT:    s_or_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fneg_fabs_v4bf16:
@@ -731,14 +727,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    s_bitset1_b32 s3, 15
-; GFX9-NEXT:    s_bitset1_b32 s2, 15
-; GFX9-NEXT:    s_bitset1_b32 s5, 15
-; GFX9-NEXT:    s_bitset1_b32 s4, 15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff7fff
+; GFX9-NEXT:    s_and_b32 s5, s3, 0x7fff7fff
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; GFX9-NEXT:    s_xor_b32 s3, s3, 0x8000
+; GFX9-NEXT:    s_xor_b32 s5, s5, 0x8000
+; GFX9-NEXT:    s_xor_b32 s2, s2, 0x8000
+; GFX9-NEXT:    s_xor_b32 s4, s4, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -748,14 +746,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX11-NEXT:    s_bitset1_b32 s3, 15
-; GFX11-NEXT:    s_bitset1_b32 s2, 15
-; GFX11-NEXT:    s_bitset1_b32 s4, 15
-; GFX11-NEXT:    s_bitset1_b32 s5, 15
-; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-NEXT:    s_and_b32 s4, s2, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s5, s3, 0x7fff7fff
+; GFX11-NEXT:    s_bfe_u32 s3, s3, 0xf0010
+; GFX11-NEXT:    s_bfe_u32 s2, s2, 0xf0010
+; GFX11-NEXT:    s_xor_b32 s3, s3, 0x8000
+; GFX11-NEXT:    s_xor_b32 s2, s2, 0x8000
+; GFX11-NEXT:    s_xor_b32 s4, s4, 0x8000
+; GFX11-NEXT:    s_xor_b32 s5, s5, 0x8000
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index eca8c2837b0fc..64a223877e056 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
-; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    v_mov_b32_e32 v3, s1
-; CIVI-NEXT:    v_mov_b32_e32 v0, s2
-; CIVI-NEXT:    v_mov_b32_e32 v1, s3
-; CIVI-NEXT:    v_mov_b32_e32 v2, s0
-; CIVI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    s_or_b32 s0, s3, 0x80008000
+; CIVI-NEXT:    s_or_b32 s1, s2, 0x80008000
+; CIVI-NEXT:    v_mov_b32_e32 v2, s1
+; CIVI-NEXT:    v_mov_b32_e32 v3, s0
+; CIVI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CIVI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fneg_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 1fa9bfa3cfa3f..b93a598cb52ae 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -215,13 +215,13 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset1_b32 s3, 31
-; VI-NEXT:    s_bitset1_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_or_b32 s0, s3, 0x80000000
+; VI-NEXT:    s_or_b32 s1, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b283290a..191253a42a076 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1633,34 +1633,34 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
 define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) {
 ; GFX7-LABEL: multiple_uses_fneg_select_f64:
 ; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x6
 ; GFX7-NEXT:    s_add_i32 s12, s12, s17
 ; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_bitcmp1_b32 s6, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX7-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, -v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: multiple_uses_fneg_select_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s6, s[8:9], 0x10
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bitcmp1_b32 s6, 0
@@ -1682,7 +1682,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x10
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x18
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 7262724064918..8f01bbf6d0f5c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -66,13 +66,13 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT:    s_xor_b32 s1, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fneg_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fceedd7b72..0d2a4ade3c1e6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -66,32 +66,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -149,32 +147,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -232,32 +228,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -315,32 +309,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -398,32 +390,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -481,32 +471,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -564,32 +552,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -647,32 +633,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -730,32 +714,30 @@ main_body:
 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -813,32 +795,30 @@ main_body:
 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 4 offen sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -896,32 +876,30 @@ main_body:
 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -979,32 +957,30 @@ main_body:
 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX90A-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX942:       ; %bb.0: ; %main_body
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x3c
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x3c
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 idxen offset:4 sc0 nt
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 43caa4c739fb3..718cc4afe22da 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -179,22 +179,22 @@ entry:
 define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshl_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; SI-NEXT:    s_not_b32 s3, s5
-; SI-NEXT:    s_lshr_b32 s1, s1, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_not_b32 s1, s4
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; SI-NEXT:    s_lshr_b32 s0, s0, 1
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    s_not_b32 s1, s7
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    v_alignbit_b32 v0, s3, v0, 1
+; SI-NEXT:    s_lshr_b32 s0, s3, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_alignbit_b32 v1, s0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_not_b32 s1, s6
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
+; SI-NEXT:    s_lshr_b32 s0, s2, 1
 ; SI-NEXT:    v_mov_b32_e32 v2, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -202,47 +202,43 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; VI-LABEL: fshl_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_not_b32 s7, s7
-; VI-NEXT:    s_lshr_b32 s3, s1, 1
-; VI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_not_b32 s1, s6
-; VI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; VI-NEXT:    s_lshr_b32 s0, s0, 1
-; VI-NEXT:    v_mov_b32_e32 v2, s1
-; VI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    s_not_b32 s1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 s0, s3, 1
+; VI-NEXT:    v_alignbit_b32 v2, s3, v2, 1
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_alignbit_b32 v3, s0, v2, v3
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_not_b32 s1, s6
+; VI-NEXT:    v_alignbit_b32 v2, s2, v2, 1
+; VI-NEXT:    s_lshr_b32 s0, s2, 1
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_alignbit_b32 v2, s0, v2, v4
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 1
-; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, 1
-; GFX9-NEXT:    s_not_b32 s1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    s_not_b32 s1, s15
+; GFX9-NEXT:    s_lshr_b32 s0, s11, 1
+; GFX9-NEXT:    v_alignbit_b32 v0, s11, v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_not_b32 s1, s8
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    s_not_b32 s1, s14
+; GFX9-NEXT:    v_alignbit_b32 v0, s10, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s10, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32:
@@ -265,40 +261,34 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; GFX10-LABEL: fshl_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v0, s1, s3, 1
-; GFX10-NEXT:    v_alignbit_b32 v3, s0, s2, 1
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_not_b32 s2, s7
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX10-NEXT:    s_not_b32 s3, s6
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, v0, s2
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, v3, s3
+; GFX10-NEXT:    v_alignbit_b32 v0, s11, s13, 1
+; GFX10-NEXT:    v_alignbit_b32 v3, s10, s12, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s11, 1
+; GFX10-NEXT:    s_not_b32 s1, s15
+; GFX10-NEXT:    s_lshr_b32 s2, s10, 1
+; GFX10-NEXT:    s_not_b32 s3, s14
+; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, v3, s3
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, s1, s3, 1
-; GFX11-NEXT:    v_alignbit_b32 v3, s0, s2, 1
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX11-NEXT:    s_not_b32 s2, s7
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX11-NEXT:    s_not_b32 s3, s6
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, v0, s2
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v3, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v0, s3, s5, 1
+; GFX11-NEXT:    v_alignbit_b32 v3, s2, s4, 1
+; GFX11-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX11-NEXT:    s_not_b32 s4, s7
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_not_b32 s5, s6
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, v0, s4
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, v3, s5
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -309,43 +299,45 @@ entry:
 define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshl_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 23
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 25
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshl_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, 23
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, 25
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 23
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 25
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, 23
+; GFX9-NEXT:    v_alignbit_b32 v0, s2, v3, 25
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32_imm:
@@ -365,25 +357,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX10-LABEL: fshl_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 23
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 25
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s7, 23
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s6, 25
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_v2i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 23
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 25
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s5, 23
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s4, 25
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096442c96..7705fa7b5423e 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -221,51 +221,47 @@ entry:
 define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
 ; SI-LABEL: fshr_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, v3
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, v2
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-NEXT:    v_alignbit_b32 v1, s11, v0, v1
+; GFX9-NEXT:    v_alignbit_b32 v0, s10, v3, v4
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -285,79 +281,64 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; GFX10-LABEL: fshr_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s15
+; GFX10-NEXT:    v_mov_b32_e32 v2, s14
+; GFX10-NEXT:    v_alignbit_b32 v1, s11, s13, v0
+; GFX10-NEXT:    v_alignbit_b32 v0, s10, s12, v2
 ; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: fshr_v2i32:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT:    s_clause 0x2
-; GFX11-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
-; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0.l
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s2, s4, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fshr_v2i32:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT:    s_clause 0x2
-; GFX11-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
-; GFX11-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s2, s4, v2
+; GFX11-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX12-TRUE16-LABEL: fshr_v2i32:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT:    s_clause 0x2
-; GFX12-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
-; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
-; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0.l
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s2, s4, v0.h
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX12-FAKE16-LABEL: fshr_v2i32:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT:    s_clause 0x2
-; GFX12-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
-; GFX12-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s3, s5, v0
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s2, s4, v2
+; GFX12-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -368,43 +349,45 @@ entry:
 define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 ; SI-LABEL: fshr_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fshr_v2i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_alignbit_b32 v3, s3, v2, 9
+; VI-NEXT:    v_alignbit_b32 v2, s2, v4, 7
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_v2i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 9
-; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 7
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, 9
+; GFX9-NEXT:    v_alignbit_b32 v0, s2, v3, 7
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v2i32_imm:
@@ -424,37 +407,37 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX10-LABEL: fshr_v2i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s7, 9
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s6, 7
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_v2i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s5, 9
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s4, 7
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: fshr_v2i32_imm:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_alignbit_b32 v1, s1, s3, 9
-; GFX12-NEXT:    v_alignbit_b32 v0, s0, s2, 7
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT:    v_alignbit_b32 v1, s3, s5, 9
+; GFX12-NEXT:    v_alignbit_b32 v0, s2, s4, 7
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 0512b9bc2b54a..ff98e89a9faa4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -321,14 +321,13 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -338,13 +337,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_add_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -357,14 +355,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_add_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -373,16 +370,15 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -399,7 +395,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_add_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -420,7 +416,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_add_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -444,7 +440,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -462,7 +458,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -613,30 +609,29 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_add_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -647,14 +642,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_add_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -663,16 +657,15 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_add_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_add_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -688,7 +681,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_add_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -709,7 +702,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_add_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -731,7 +724,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_add_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -749,7 +742,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_add_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -901,14 +894,13 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -918,13 +910,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_and_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -937,14 +928,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_and_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -953,16 +943,15 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_and_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -979,7 +968,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_and_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1000,7 +989,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_and_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1024,7 +1013,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1042,7 +1031,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1193,30 +1182,29 @@ entry:
 define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_and_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1227,14 +1215,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_and_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1243,16 +1230,15 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_and_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_and_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1268,7 +1254,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_and_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1289,7 +1275,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_and_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1311,7 +1297,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_and_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1329,7 +1315,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_and_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1481,14 +1467,13 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -1498,13 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_sub_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1517,14 +1501,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1533,16 +1516,15 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_sub_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1559,7 +1541,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1580,7 +1562,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1604,7 +1586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1622,7 +1604,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -1773,30 +1755,29 @@ entry:
 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_sub_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1807,14 +1788,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_sub_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1823,16 +1803,15 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_sub_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_sub_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1848,7 +1827,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_sub_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1869,7 +1848,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_sub_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1891,7 +1870,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -1909,7 +1888,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_sub_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2057,14 +2036,13 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -2072,13 +2050,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_max_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2089,30 +2066,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2128,7 +2103,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2148,7 +2123,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2171,7 +2146,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2188,7 +2163,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2327,28 +2302,27 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2357,30 +2331,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_max_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2395,7 +2367,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_max_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2415,7 +2387,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2436,7 +2408,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2453,7 +2425,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_max_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2593,14 +2565,13 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -2608,13 +2579,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umax_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2625,30 +2595,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2664,7 +2632,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2684,7 +2652,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2707,7 +2675,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2724,7 +2692,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -2863,28 +2831,27 @@ entry:
 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2893,30 +2860,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umax_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2931,7 +2896,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_umax_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2951,7 +2916,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2972,7 +2937,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2989,7 +2954,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_umax_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3129,14 +3094,13 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -3144,13 +3108,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_min_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -3161,30 +3124,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3200,7 +3161,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3220,7 +3181,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3243,7 +3204,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3260,7 +3221,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3399,28 +3360,27 @@ entry:
 define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3429,30 +3389,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_min_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3467,7 +3425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_min_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3487,7 +3445,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3508,7 +3466,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3525,7 +3483,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_min_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3665,14 +3623,13 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -3680,13 +3637,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umin_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -3697,30 +3653,28 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3736,7 +3690,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3756,7 +3710,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3779,7 +3733,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -3796,7 +3750,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -3935,28 +3889,27 @@ entry:
 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umin_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3965,30 +3918,28 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_umin_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: atomic_umin_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -4003,7 +3954,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_umin_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4023,7 +3974,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_umin_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4044,7 +3995,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4061,7 +4012,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_umin_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4212,14 +4163,13 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -4229,13 +4179,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; VI-LABEL: atomic_or_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4248,14 +4197,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; GFX9-LABEL: atomic_or_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4264,16 +4212,15 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_or_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4290,7 +4237,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ; SI-LABEL: atomic_or_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4311,7 +4258,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; VI-LABEL: atomic_or_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4335,7 +4282,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4353,7 +4300,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX11-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4504,30 +4451,29 @@ entry:
 define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_or_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -4538,14 +4484,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_or_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4554,16 +4499,15 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_or_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_or_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4579,7 +4523,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; SI-LABEL: atomic_or_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4600,7 +4544,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; VI-LABEL: atomic_or_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4622,7 +4566,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: atomic_or_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4640,7 +4584,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX11-LABEL: atomic_or_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -4849,14 +4793,13 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -4866,13 +4809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_xchg_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4885,14 +4827,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4901,16 +4842,15 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -4927,7 +4867,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4948,7 +4888,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4972,7 +4912,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -4990,7 +4930,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -5141,30 +5081,29 @@ entry:
 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_xchg_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -5175,14 +5114,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX9-LABEL: atomic_xchg_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -5191,16 +5129,15 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in
 ;
 ; GFX11-LABEL: atomic_xchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5216,7 +5153,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_xchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5237,7 +5174,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_xchg_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5259,7 +5196,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -5277,7 +5214,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_xchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -5437,16 +5374,15 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s7, s[4:5], 0xf
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16
@@ -5456,18 +5392,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5477,17 +5412,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5495,17 +5429,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5522,9 +5455,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x11
+; SI-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5532,8 +5465,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; SI-NEXT:    s_lshl_b64 s[8:9], s[8:9], 2
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
 ; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16 glc
@@ -5545,7 +5478,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5571,17 +5504,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5591,7 +5524,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -5752,35 +5685,34 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v3, s5
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
@@ -5790,17 +5722,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x3c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x3c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5808,17 +5739,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ;
 ; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b32 s7, s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x3c
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -5834,9 +5764,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x11
+; SI-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
@@ -5844,8 +5774,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; SI-NEXT:    s_lshl_b64 s[8:9], s[8:9], 2
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
 ; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 glc
@@ -5857,7 +5787,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s9, s[4:5], 0x44
@@ -5881,17 +5811,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x44
+; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -5901,7 +5831,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s9, s[4:5], 0x44
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -6055,14 +5985,13 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -6072,13 +6001,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_xor_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -6091,14 +6019,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6107,16 +6034,15 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_xor_b32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -6133,7 +6059,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6154,7 +6080,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6178,7 +6104,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -6196,7 +6122,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -6347,30 +6273,29 @@ entry:
 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_xor_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -6381,14 +6306,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_xor_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6397,16 +6321,15 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX11-LABEL: atomic_xor_i32_addr64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_xor_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -6422,7 +6345,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-LABEL: atomic_xor_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6443,7 +6366,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: atomic_xor_i32_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6465,7 +6388,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -6483,7 +6406,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: atomic_xor_i32_ret_addr64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -8495,14 +8418,13 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_inc_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -8512,13 +8434,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_inc_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,14 +8452,13 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_inc_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_inc v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -8547,16 +8467,15 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_inc_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -8573,7 +8492,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8594,7 +8513,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8618,7 +8537,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -8636,7 +8555,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_inc_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
@@ -8974,14 +8893,13 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_dec_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
+; SI-NEXT:    s_lshl_b64 s[4:5], s[2:3], 2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:16
@@ -8991,13 +8909,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_dec_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s0, s2, s0
-; VI-NEXT:    s_addc_u32 s1, s3, s1
+; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; VI-NEXT:    s_add_u32 s0, s0, s2
+; VI-NEXT:    s_addc_u32 s1, s1, s3
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -9010,14 +8927,13 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_dec_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s2, s0
-; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_atomic_dec v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -9026,16 +8942,15 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX11-LABEL: atomic_dec_i32_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x2c
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11-NEXT:    s_add_u32 s0, s2, s0
-; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[0:1] offset:16
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -9052,7 +8967,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9073,7 +8988,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9097,7 +9012,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -9115,7 +9030,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: atomic_dec_i32_ret_addr64_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 117cf40de72d2..d91763030cf56 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_add_u32 s4, s0, 4
-; CIVI-NEXT:    s_addc_u32 s5, s1, 0
-; CIVI-NEXT:    v_mov_b32_e32 v2, s4
-; CIVI-NEXT:    v_mov_b32_e32 v4, s3
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    v_mov_b32_e32 v3, s5
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
-; CIVI-NEXT:    v_mov_b32_e32 v5, s2
-; CIVI-NEXT:    flat_store_short v[2:3], v4
-; CIVI-NEXT:    flat_store_dword v[0:1], v5
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
+; CIVI-NEXT:    s_add_u32 s0, s0, 4
+; CIVI-NEXT:    flat_store_dword v[0:1], v2
+; CIVI-NEXT:    s_addc_u32 s1, s1, 0
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s3
+; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3f16_arg:
@@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v3, s3
 ; CIVI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CIVI-NEXT:    s_endpgm
@@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; GFX11-LABEL: load_v4f16_arg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   store <4 x half> %arg, ptr addrspace(1) %out
@@ -348,37 +348,21 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 }
 
 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
-; CI-LABEL: extload_v3f16_to_v3f32_arg:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s2, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v4, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s0
-; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: extload_v3f16_to_v3f32_arg:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
-; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
-; VI-NEXT:    s_endpgm
+; CIVI-LABEL: extload_v3f16_to_v3f32_arg:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT:    s_add_i32 s12, s12, s17
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_lshr_b32 s4, s2, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; CIVI-NEXT:    v_mov_b32_e32 v3, s0
+; CIVI-NEXT:    v_mov_b32_e32 v4, s1
+; CIVI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
+; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
 ; GFX11:       ; %bb.0:
@@ -386,9 +370,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <3 x half> %arg to <3 x float>
@@ -404,14 +388,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s3, 16
-; CI-NEXT:    s_lshr_b32 s5, s2, 16
+; CI-NEXT:    s_lshr_b32 s4, s2, 16
+; CI-NEXT:    s_lshr_b32 s5, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    v_mov_b32_e32 v4, s0
+; CI-NEXT:    v_mov_b32_e32 v5, s1
 ; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; CI-NEXT:    s_endpgm
 ;
@@ -424,12 +408,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -440,10 +424,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <4 x half> %arg to <4 x float>
@@ -724,61 +708,33 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 }
 
 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
-; CI-LABEL: extload_v4f16_to_v4f64_arg:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s4, s3, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; CI-NEXT:    s_lshr_b32 s5, s2, 16
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, s5
-; CI-NEXT:    s_add_u32 s2, s0, 16
-; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; CI-NEXT:    s_addc_u32 s3, s1, 0
-; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; CI-NEXT:    v_mov_b32_e32 v9, s3
-; CI-NEXT:    v_mov_b32_e32 v8, s2
-; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; CI-NEXT:    s_nop 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: extload_v4f16_to_v4f64_arg:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s5
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; VI-NEXT:    s_add_u32 s2, s0, 16
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; VI-NEXT:    v_mov_b32_e32 v9, s3
-; VI-NEXT:    v_mov_b32_e32 v8, s2
-; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; VI-NEXT:    s_endpgm
+; CIVI-LABEL: extload_v4f16_to_v4f64_arg:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT:    s_add_i32 s12, s12, s17
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_lshr_b32 s5, s3, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v0, s3
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; CIVI-NEXT:    s_lshr_b32 s4, s2, 16
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v4, s2
+; CIVI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; CIVI-NEXT:    s_add_u32 s2, s0, 16
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; CIVI-NEXT:    s_addc_u32 s3, s1, 0
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; CIVI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; CIVI-NEXT:    v_mov_b32_e32 v9, s3
+; CIVI-NEXT:    v_mov_b32_e32 v8, s2
+; CIVI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; CIVI-NEXT:    s_nop 0
+; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CIVI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index e1b4cad370f96..11826aa0b360d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -290,19 +290,19 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec,
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s5, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s5, 4
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec,
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s5, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s5, 4
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x34
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s4, s4, 3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s1, s3, 0x1010101
+; GCN-NEXT:    s_xor_b32 s0, s2, 0x1010101
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 44bd4090436ef..0251a5af8916b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1573,13 +1573,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s0, 0x50005
+; VI-NEXT:    s_lshl_b32 s0, s8, 4
+; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s9, s8
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; VI-NEXT:    s_xor_b64 s[8:9], s[2:3], s[8:9]
+; VI-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
 ; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
@@ -1680,14 +1680,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0xa
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; SI-NEXT:    s_and_b32 s5, s6, s5
 ; SI-NEXT:    s_xor_b32 s4, s5, s4
@@ -1697,14 +1696,13 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
 ; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
 ; VI-NEXT:    s_and_b32 s5, s6, s5
 ; VI-NEXT:    s_xor_b32 s4, s5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..b9623a45607bd 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s0, 4
-; VI-NEXT:    s_addc_u32 s5, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v5, s2
-; VI-NEXT:    flat_store_short v[2:3], v4
-; VI-NEXT:    flat_store_dword v[0:1], v5
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_add_u32 s0, s0, 4
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v3i16_arg:
@@ -1328,8 +1328,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -2393,8 +2393,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 76cff962f7c20..2050c76c09e42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
@@ -7,123 +8,210 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
 declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
 
-; GCN-LABEL: {{^}}test_div_fmas_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: s_bitcmp1_b32 s{{[0-9]+}}, 0
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s7, s[4:5], 0x1c
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
-; SI: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x1c
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x16
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bitcmp1_b32 s6, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
-; GCN: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x1c
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f64:
-; GCN: v_div_fmas_f64
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bitcmp1_b32 s8, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s9, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind {
+; GCN-LABEL: test_div_fmas_f32_cond_to_vcc:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s3, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
-; GCN: s_mov_b64 vcc, 0
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_mov_b64 vcc, 0
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
-; GCN: s_mov_b64 vcc, -1
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_mov_b64 vcc, -1
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
   store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
-; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
-; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-
-; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
-; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
-; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
-; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
-; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
-; SI: s_endpgm
 define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_and_b64 vcc, vcc, s[2:3]
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, s7
+; GCN-NEXT:    v_div_fmas_f32 v0, v0, v3, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
@@ -143,26 +231,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
   ret void
 }
 
-; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-
-; SI: ; %entry
-; SI:     v_cmp_eq_u32_e64   [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
-; SI:     s_mov_b64          vcc, 0
-; SI:     s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
-
-; SI: ; %bb
-; SI:     buffer_load_dword  [[LOAD:v[0-9]+]],
-; SI:     v_cmp_ne_u32_e32   vcc, 0, [[LOAD]]
-; SI:     s_and_b64          vcc, vcc, exec
-
-; SI: ; %exit
-; SI:     s_or_b64           exec, exec, [[SAVE]]
-; SI-NOT: vcc
-; SI:     v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI:     buffer_store_dword
-; SI:     s_endpgm
-
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind {
+; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[0:1], s[10:11]
+; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
+; GCN-NEXT:    buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GCN-NEXT:    s_mov_b64 vcc, 0
+; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[0:1]
+; GCN-NEXT:    s_cbranch_execz .LBB9_2
+; GCN-NEXT:  ; %bb.1: ; %bb
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, s3
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    s_and_b64 vcc, vcc, exec
+; GCN-NEXT:  .LBB9_2: ; %exit
+; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s11, s3
+; GCN-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GCN-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
@@ -188,3 +289,5 @@ exit:
   store float %result, ptr addrspace(1) %gep.out, align 4
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 4628a9c15391b..6af59bd5377e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -53,15 +53,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; SDAG-NEXT:    v_mov_b32_e32 v6, s14
+; SDAG-NEXT:    v_mov_b32_e32 v7, s15
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
@@ -100,15 +104,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; SDAG-NEXT:    v_mov_b32_e32 v6, s14
+; SDAG-NEXT:    v_mov_b32_e32 v7, s15
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
@@ -1899,15 +1907,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    v_mov_b32_e32 v6, s14
+; GCN-NEXT:    v_mov_b32_e32 v7, s15
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
@@ -1925,15 +1937,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    v_mov_b32_e32 v6, s14
+; GCN-NEXT:    v_mov_b32_e32 v7, s15
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 978f223aafb94..7140ba5cf2ad9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3fb8a000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0x42b17218
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x39a3b295, v6
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x39a3b295, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3fb8a000, v8
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 70c3787bac9a1..491d5a8e077e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_exp10_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x40549000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0x421a209b
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3a2784bc, v6
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x40549000, v6
-; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc23369f4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3a2784bc, v8
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x40549000, v8
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT:    v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT:    v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 15bcab9f774e4..fd0f8e777eebd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -225,26 +225,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_exp2_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_exp_f32_e32 v4, v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_add_f32_e32 v2, s2, v2
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_add_f32_e32 v0, s2, v0
-; VI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT:    s_cselect_b32 s3, 0xffffffc0, 0
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, v2, s3
-; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT:    s_cselect_b32 s2, 0xffffffc0, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, s2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, v4, s0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT:    v_ldexp_f32 v2, v2, s0
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 0d5846a4a4985..0b77260cbe9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_log_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41b17218
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3f317000, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v7
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT:    v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v5
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v5, v8, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT:    v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 8006876dbe3ff..594db9ce14a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-LABEL: s_log10_v2f32:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x411a209b
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-SDAG-NEXT:    v_mul_f32_e32 v9, 0x3e9a2000, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v7
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT:    v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-SDAG-NEXT:    v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT:    v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT:    v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT:    v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v5
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT:    s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v5, v8, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT:    v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT:    s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT:    v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index c1ac74e5094b0..9e5e9e8f7b6b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -285,16 +285,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-SDAG-NEXT:    v_ldexp_f32 v3, s3, v3
 ; VI-SDAG-NEXT:    s_cselect_b32 s3, 32, 0
-; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT:    v_ldexp_f32 v0, s2, v0
 ; VI-SDAG-NEXT:    v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT:    v_ldexp_f32 v1, s2, v1
-; VI-SDAG-NEXT:    v_log_f32_e32 v4, v1
-; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v3, v2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v4, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_sub_f32_e32 v3, v3, v2
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, v5, v4
+; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 117f359be0c3b..04195ae8f6fce 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -903,14 +903,14 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; CI-LABEL: mad_i64_i32_uniform:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xb
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s3
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s5
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s4, v2, v[0:1]
 ; CI-NEXT:    s_mov_b32 s4, s0
 ; CI-NEXT:    s_mov_b32 s5, s1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -919,17 +919,17 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; SI-LABEL: mad_i64_i32_uniform:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mul_i32 s0, s2, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_mul_hi_u32 v1, s8, v0
+; SI-NEXT:    s_mul_i32 s0, s8, s9
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,14 +937,14 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ;
 ; GFX9-LABEL: mad_i64_i32_uniform:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s3
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    s_add_u32 s2, s2, s6
-; GFX9-NEXT:    s_addc_u32 s3, s4, s7
+; GFX9-NEXT:    s_mul_i32 s5, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s7
+; GFX9-NEXT:    s_add_u32 s2, s5, s2
+; GFX9-NEXT:    s_addc_u32 s3, s4, s3
 ; GFX9-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -952,13 +952,13 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX11-LABEL: mad_i64_i32_uniform:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_i32 s6, s2, s3
-; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
-; GFX11-NEXT:    s_add_u32 s2, s6, s4
-; GFX11-NEXT:    s_addc_u32 s3, s3, s5
+; GFX11-NEXT:    s_mul_i32 s4, s6, s7
+; GFX11-NEXT:    s_mul_hi_u32 s5, s6, s7
+; GFX11-NEXT:    s_add_u32 s2, s4, s2
+; GFX11-NEXT:    s_addc_u32 s3, s5, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -967,16 +967,16 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX12-LABEL: mad_i64_i32_uniform:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x2c
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s7, 0
+; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s6, s2
-; GFX12-NEXT:    s_mov_b32 s2, s3
-; GFX12-NEXT:    s_mov_b32 s3, s7
+; GFX12-NEXT:    s_mov_b32 s4, s6
+; GFX12-NEXT:    s_mov_b32 s6, s7
+; GFX12-NEXT:    s_mov_b32 s7, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ffaf62ff1e0..f17c1a4da5570 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -898,105 +898,105 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ;
 ; CI-LABEL: s_test_imin_sle_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_ashr_i32 s6, s0, 16
-; CI-NEXT:    s_ashr_i32 s7, s1, 16
-; CI-NEXT:    s_sext_i32_i16 s0, s0
-; CI-NEXT:    s_sext_i32_i16 s1, s1
-; CI-NEXT:    s_ashr_i32 s8, s2, 16
-; CI-NEXT:    s_ashr_i32 s9, s3, 16
-; CI-NEXT:    s_sext_i32_i16 s2, s2
-; CI-NEXT:    s_sext_i32_i16 s3, s3
-; CI-NEXT:    s_min_i32 s7, s7, s9
-; CI-NEXT:    s_min_i32 s1, s1, s3
-; CI-NEXT:    s_min_i32 s3, s6, s8
-; CI-NEXT:    s_min_i32 s0, s0, s2
-; CI-NEXT:    s_lshl_b32 s7, s7, 16
-; CI-NEXT:    s_and_b32 s1, s1, 0xffff
-; CI-NEXT:    s_lshl_b32 s3, s3, 16
-; CI-NEXT:    s_and_b32 s0, s0, 0xffff
-; CI-NEXT:    s_or_b32 s1, s1, s7
-; CI-NEXT:    s_or_b32 s0, s0, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_ashr_i32 s0, s2, 16
+; CI-NEXT:    s_ashr_i32 s1, s3, 16
+; CI-NEXT:    s_sext_i32_i16 s2, s2
+; CI-NEXT:    s_sext_i32_i16 s3, s3
+; CI-NEXT:    s_ashr_i32 s6, s4, 16
+; CI-NEXT:    s_ashr_i32 s7, s5, 16
+; CI-NEXT:    s_sext_i32_i16 s4, s4
+; CI-NEXT:    s_sext_i32_i16 s5, s5
+; CI-NEXT:    s_min_i32 s1, s1, s7
+; CI-NEXT:    s_min_i32 s3, s3, s5
+; CI-NEXT:    s_min_i32 s0, s0, s6
+; CI-NEXT:    s_min_i32 s2, s2, s4
+; CI-NEXT:    s_lshl_b32 s1, s1, 16
+; CI-NEXT:    s_and_b32 s3, s3, 0xffff
+; CI-NEXT:    s_lshl_b32 s0, s0, 16
+; CI-NEXT:    s_and_b32 s2, s2, 0xffff
+; CI-NEXT:    s_or_b32 s1, s3, s1
+; CI-NEXT:    s_or_b32 s0, s2, s0
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_imin_sle_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s6, s3, 16
-; VI-NEXT:    s_ashr_i32 s7, s1, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_ashr_i32 s0, s5, 16
+; VI-NEXT:    s_ashr_i32 s1, s3, 16
+; VI-NEXT:    s_min_i32 s0, s1, s0
+; VI-NEXT:    s_sext_i32_i16 s1, s5
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_min_i32 s6, s7, s6
-; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_min_i32 s1, s3, s1
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_ashr_i32 s1, s4, 16
 ; VI-NEXT:    s_ashr_i32 s3, s2, 16
-; VI-NEXT:    s_ashr_i32 s6, s0, 16
+; VI-NEXT:    s_min_i32 s1, s3, s1
+; VI-NEXT:    s_sext_i32_i16 s3, s4
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_min_i32 s3, s6, s3
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_min_i32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s1, s2, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_imin_sle_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_pk_min_i16 v1, s1, v0
-; GFX9-NEXT:    v_pk_min_i16 v0, s0, v3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_pk_min_i16 v1, s3, v0
+; GFX9-NEXT:    v_pk_min_i16 v0, s2, v3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_imin_sle_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_min_i16 v1, s1, s3
-; GFX10-NEXT:    v_pk_min_i16 v0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    v_pk_min_i16 v1, s3, s5
+; GFX10-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_sle_v4i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_i16 v1, s1, s3
-; GFX11-NEXT:    v_pk_min_i16 v0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_pk_min_i16 v1, s3, s5
+; GFX11-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1387,77 +1387,77 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ;
 ; CI-LABEL: s_test_imin_slt_v2i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_min_i32 s1, s1, s3
-; CI-NEXT:    s_min_i32 s0, s0, s2
-; CI-NEXT:    v_mov_b32_e32 v2, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_min_i32 s0, s3, s5
+; CI-NEXT:    s_min_i32 s1, s2, s4
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_imin_slt_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_min_i32 s0, s3, s5
+; VI-NEXT:    s_min_i32 s1, s2, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_imin_slt_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_min_i32 s1, s1, s3
-; GFX9-NEXT:    s_min_i32 s0, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_min_i32 s3, s3, s5
+; GFX9-NEXT:    s_min_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_imin_slt_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_min_i32 s0, s0, s2
-; GFX10-NEXT:    s_min_i32 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_min_i32 s2, s2, s4
+; GFX10-NEXT:    s_min_i32 s3, s3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_slt_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s0, s0, s2
-; GFX11-NEXT:    s_min_i32 s1, s1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_min_i32 s2, s2, s4
+; GFX11-NEXT:    s_min_i32 s3, s3, s5
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index bef38c1a65ef8..6351c5a9e7680 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -2196,8 +2196,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; GFX900-LABEL: fadd_fadd_fsub:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX900-NEXT:    v_add_f32_e32 v0, s1, v0
@@ -2205,14 +2205,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; GFX900-NEXT:    v_add_f32_e32 v3, s2, v0
 ; GFX900-NEXT:    v_sub_f32_e32 v0, s0, v1
 ; GFX900-NEXT:    v_subrev_f32_e32 v1, s3, v3
-; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; PACKED-SDAG-LABEL: fadd_fadd_fsub:
 ; PACKED-SDAG:       ; %bb.0: ; %bb
 ; PACKED-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; PACKED-SDAG-NEXT:    v_add_f32_e32 v0, s1, v0
@@ -2220,7 +2220,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v3, v0
 ; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
-; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
+; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
 ; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0a3f572..79ec56aed4582 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -94,62 +94,64 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; SI-LABEL: rotl_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_i32 s3, 32, s3
-; SI-NEXT:    s_sub_i32 s2, 32, s2
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_sub_i32 s0, 32, s9
+; SI-NEXT:    s_sub_i32 s1, 32, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: rotl_v2i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s2, 32, s2
-; GFX8-NEXT:    s_sub_i32 s3, 32, s3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_sub_i32 s1, 32, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_sub_i32 s0, 32, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v2
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotl_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s3, 32, s3
-; GFX10-NEXT:    s_sub_i32 s2, 32, s2
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    s_sub_i32 s4, 32, s7
+; GFX10-NEXT:    s_sub_i32 s5, 32, s6
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s3, s4
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s2, s5
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_i32 s3, 32, s3
-; GFX11-NEXT:    s_sub_i32 s2, 32, s2
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_sub_i32 s4, 32, s7
+; GFX11-NEXT:    s_sub_i32 s5, 32, s6
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s3, s4
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s2, s5
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <2 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..2cc186b85b5d9 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -83,54 +83,56 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ;
 ; SI-LABEL: rotr_v2i32:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: rotr_v2i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v4
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotr_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    v_alignbit_b32 v1, s3, s3, s7
+; GFX10-NEXT:    v_alignbit_b32 v0, s2, s2, s6
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_v2i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s3, s3, s5
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s2, s4
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index 3140511f99678..f20c3b603b90f 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 {
 }
 
 ; SI-LABEL: {{^}}s_addk_v2i32_k0:
-; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
-; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x42
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x41
 ; SI: s_endpgm
 ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b
 define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
index 8f944269869df..8d026a943dea7 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -25,18 +25,19 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a
 define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: sext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s5, s9, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_bfe_i64 s[0:1], s[8:9], 0x100000
+; GCN-NEXT:    s_add_u32 s0, s2, s0
+; GCN-NEXT:    s_addc_u32 s1, s3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %sext = sext i16 %a to i64
   %res = add i64 %b, %sext
@@ -89,18 +90,19 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16
 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) {
 ; GCN-LABEL: sext_i32_to_i64_uniform:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s5, s8, 31
-; GCN-NEXT:    s_add_u32 s4, s6, s8
-; GCN-NEXT:    s_addc_u32 s5, s7, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_ashr_i32 s1, s8, 31
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_add_u32 s0, s2, s8
+; GCN-NEXT:    s_addc_u32 s1, s3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %sext = sext i32 %a to i64
   %res = add i64 %b, %sext
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index a82a6a8a4c367..cd55dec6b2d57 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1361,28 +1361,30 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_64_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 64, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 64, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_64_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 64, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 64, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_64_i64:
@@ -1411,28 +1413,30 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_1_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 1, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_1_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 1, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_1_i64:
@@ -1462,28 +1466,30 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
 define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_1_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 1.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_1_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 1.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_1_0_i64:
@@ -1509,28 +1515,30 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -1.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -1.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
@@ -1556,28 +1564,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_0_5_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 0.5, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_0_5_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 0.5, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_0_5_i64:
@@ -1603,28 +1613,30 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -0.5, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -0.5, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
@@ -1650,28 +1662,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_2_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 2.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_2_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 2.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_2_0_i64:
@@ -1697,28 +1711,30 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -2.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -2.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
@@ -1744,28 +1760,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 4.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 4.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_4_0_i64:
@@ -1791,28 +1809,30 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], -4.0, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], -4.0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
@@ -1841,28 +1861,30 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], 0x40800000, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 0x40800000, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
@@ -1893,32 +1915,34 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, -4.0
-; SI-NEXT:    s_mov_b32 s5, -1
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s0, -4.0
+; SI-NEXT:    s_mov_b32 s1, -1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, -4.0
-; VI-NEXT:    s_mov_b32 s5, -1
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, -4.0
+; VI-NEXT:    s_mov_b32 s1, -1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
@@ -1949,32 +1973,34 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s5, 4.0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s8, 0
+; SI-NEXT:    s_mov_b32 s9, 4.0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, 4.0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s8, 0
+; VI-NEXT:    s_mov_b32 s9, 4.0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
@@ -2000,32 +2026,34 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o
 define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s5, -4.0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s8, 0
+; SI-NEXT:    s_mov_b32 s9, -4.0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, -4.0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s8, 0
+; VI-NEXT:    s_mov_b32 s9, -4.0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 61da875cf2f28..86ea09914d497 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -339,38 +339,38 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr a
 define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 {
 ; CI-LABEL: reorder_smrd_load_local_store_smrd_load:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dword s8, s[4:5], 0xb
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x63
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s4, s0
+; CI-NEXT:    s_mov_b32 s5, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x1
 ; CI-NEXT:    v_mov_b32_e32 v1, s8
-; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    ds_write_b32 v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s4, s4, s5
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_add_i32 s0, s0, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: reorder_smrd_load_local_store_smrd_load:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    ds_write_b32 v2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_add_i32 s2, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 47998767a948c..be10302c42854 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -331,39 +331,39 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s7, s2, 16
-; VI-NEXT:    s_sub_i32 s7, 0, s7
-; VI-NEXT:    s_sub_i32 s4, 0, s3
-; VI-NEXT:    s_lshr_b32 s6, s3, 16
-; VI-NEXT:    s_ashr_i32 s8, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s7, s7
-; VI-NEXT:    s_sub_i32 s5, 0, s2
-; VI-NEXT:    s_sub_i32 s6, 0, s6
-; VI-NEXT:    s_max_i32 s7, s8, s7
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_sub_i32 s1, 0, s2
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_sub_i32 s0, 0, s3
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_ashr_i32 s6, s2, 16
+; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s3, s4
-; VI-NEXT:    s_max_i32 s6, s8, s6
-; VI-NEXT:    s_max_i32 s2, s2, s5
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s4, s6, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_add_i32 s2, s2, 2
-; VI-NEXT:    s_or_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s4, s7, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s2, s4, s2
-; VI-NEXT:    s_add_i32 s3, s3, 0x20000
-; VI-NEXT:    s_add_i32 s2, s2, 0x20000
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_sub_i32 s4, 0, s4
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_max_i32 s1, s2, s1
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_sext_i32_i16 s2, s3
+; VI-NEXT:    s_max_i32 s5, s6, s5
+; VI-NEXT:    s_ashr_i32 s6, s3, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_max_i32 s0, s2, s0
+; VI-NEXT:    s_max_i32 s4, s6, s4
+; VI-NEXT:    s_add_i32 s0, s0, 2
+; VI-NEXT:    s_lshl_b32 s2, s4, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_add_i32 s1, s1, 2
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_lshl_b32 s2, s5, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s1, s2, s1
+; VI-NEXT:    s_add_i32 s0, s0, 0x20000
+; VI-NEXT:    s_add_i32 s1, s1, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_abs_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314b9465f..2dfd386636ec7 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,92 +32,93 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[54:55], s[8:9], 0x8
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
+; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dword s9, s[38:39], 0x20
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[0:1]
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s8, 0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s9, 0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, v1
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s8, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s9, 1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s6, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s7, 1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 4
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 5
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v1
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 6
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 7
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v3
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s70, 8
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s71, 9
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 5
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow17
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow30
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
@@ -137,9 +138,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -154,12 +155,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock14
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow28
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -168,7 +169,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow27
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -188,8 +189,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s8, 10
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 7
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s55
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
@@ -206,12 +208,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_16
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow18
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
@@ -220,7 +222,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -237,7 +239,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -251,9 +253,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -261,9 +263,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[70:71]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
@@ -274,25 +276,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v59, 8
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s8, v59, 10
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    s_mov_b32 s55, s83
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v59, 9
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v59, 11
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow26
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 3
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
@@ -318,9 +318,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
@@ -336,9 +336,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s84
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s81
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
@@ -346,92 +346,93 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
-; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[8:9], 0x8
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
-; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
+; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dword s9, s[38:39], 0x20
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
-; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[0:1]
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s8, 0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s9, 0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, v1
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s8, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s9, 1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[82:83], 1, v3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s6, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s7, 1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
-; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v4
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 4
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 5
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v1
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 6
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 7
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s82, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s83, 9
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 5
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
-; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
+; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow17
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
-; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow30
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
@@ -451,13 +452,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -468,12 +469,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
-; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock12
+; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock14
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow26
+; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow28
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
@@ -482,7 +483,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow27
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
@@ -502,9 +503,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s8, 10
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 3
-; GLOBALNESS0-NEXT:    s_mov_b32 s83, s55
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 7
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s55
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
@@ -521,12 +522,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[96:97], 0, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[98:99], 1, v0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_16
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow18
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
@@ -535,7 +536,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[84:85]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -552,21 +553,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[98:99]
 ; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[80:81]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[68:69]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s84, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s85, s39, 0
+; GLOBALNESS0-NEXT:    s_add_u32 s82, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s83, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -574,9 +575,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[84:85]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[82:83]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -589,23 +590,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
+; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v59, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v59, 8
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v59, 10
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v59, 9
+; GLOBALNESS0-NEXT:    s_mov_b32 s55, s81
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v59, 9
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v59, 11
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow26
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 3
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
@@ -631,7 +632,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
@@ -649,7 +650,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s82
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 74e536f813716..66761139dc8ed 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -164,98 +164,102 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
 ;
 ; GFX6-LABEL: test_udivrem_v2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s6, s6, s2
-; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s6, s0, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_mul_i32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s4, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s4, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v0, s4, v1
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s9
+; GFX6-NEXT:    s_sub_i32 s0, s3, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_udivrem_v2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT:    s_sub_i32 s6, 0, s2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT:    s_sub_i32 s0, 0, s6
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX8-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8-NEXT:    s_sub_i32 s0, s0, s6
-; GFX8-NEXT:    s_sub_i32 s6, s0, s2
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX8-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX8-NEXT:    s_sub_i32 s6, s0, s2
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s2
-; GFX8-NEXT:    s_cselect_b32 s0, s6, s0
-; GFX8-NEXT:    s_sub_i32 s2, 0, s3
-; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_mul_i32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s2, s2, s4
+; GFX8-NEXT:    s_sub_i32 s4, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s4, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s4, 0, s7
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v2, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    s_mul_i32 s0, s0, s3
-; GFX8-NEXT:    s_sub_i32 s0, s1, s0
-; GFX8-NEXT:    s_sub_i32 s1, s0, s3
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s7
+; GFX8-NEXT:    s_sub_i32 s0, s3, s0
+; GFX8-NEXT:    s_sub_i32 s1, s0, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s7
 ; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX8-NEXT:    s_sub_i32 s1, s0, s3
-; GFX8-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX8-NEXT:    s_sub_i32 s1, s0, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s7
 ; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 97738a7944741..d1500b290b154 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -258,8 +258,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
-; GCN-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %cast = uitofp <2 x i32> %in to <2 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
index 45cb7955b612d..b99b325d02cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -25,18 +25,19 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a
 define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: zext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s4, s8, 0xffff
-; GCN-NEXT:    s_add_u32 s4, s6, s4
-; GCN-NEXT:    s_addc_u32 s5, s7, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_and_b32 s0, s8, 0xffff
+; GCN-NEXT:    s_add_u32 s0, s2, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_addc_u32 s1, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %zext = zext i16 %a to i64
   %res = add i64 %b, %zext
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
index c53f4b6d7ff2b..f8659812ad2ad 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -7,19 +7,24 @@ define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
-; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
-; CHECK-NEXT:    [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[LOAD2_MUT1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[LOAD4_MUT2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[LOAD3_MUT3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT1]] to <2 x i16>
+; CHECK-NEXT:    [[LOAD3_TOORIG:%.*]] = bitcast i32 [[LOAD3_MUT3]] to float
+; CHECK-NEXT:    [[LOAD4_TOORIG:%.*]] = bitcast i32 [[LOAD4_MUT2]] to <4 x i8>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
 ; CHECK-NEXT:    store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
-; CHECK-NEXT:    [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
-; CHECK-NEXT:    store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
+; CHECK-NEXT:    [[LOAD2_BC:%.*]] = bitcast <2 x i16> [[LOAD2_TOORIG]] to i32
+; CHECK-NEXT:    store i32 [[LOAD2_BC]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[LOAD3_BC:%.*]] = bitcast float [[LOAD3_TOORIG]] to i32
 ; CHECK-NEXT:    [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
-; CHECK-NEXT:    store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
+; CHECK-NEXT:    [[LOAD4_BC:%.*]] = bitcast <4 x i8> [[LOAD4_TOORIG]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD4_BC]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD3_BC]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
@@ -45,13 +50,17 @@ define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2
 ; CHECK-LABEL: define void @merge_f32_v2f16_type(
 ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
-; CHECK-NEXT:    [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float
+; CHECK-NEXT:    [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half>
 ; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
-; CHECK-NEXT:    store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
-; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32
+; CHECK-NEXT:    [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index d6b51039d5b44..ac22859ed4e34 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -130,21 +130,39 @@ entry:
   ret void
 }
 
-; Ideally this would be merged
 define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
 ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16>
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
+
+  %ld.0 = load i32, ptr addrspace(1) %a
+  %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
+
+  ret void
+}
+
+define amdgpu_kernel void @merge_load_i32_v2i64(ptr addrspace(1) nocapture %a) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i64(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
 ; CHECK-NEXT:    [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
+; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
 
   %ld.0 = load i32, ptr addrspace(1) %a
-  %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
+  %ld.1 = load <2 x i8>, ptr addrspace(1) %a.1
 
   ret void
 }



More information about the llvm-commits mailing list