[llvm] Redesign Straight-Line Strength Reduction (SLSR) (PR #162930)

Fei Peng via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 10 22:49:51 PDT 2025


https://github.com/fiigii updated https://github.com/llvm/llvm-project/pull/162930

>From 993123240e39c75e09d067b8afdc6fa288b1c951 Mon Sep 17 00:00:00 2001
From: feip <feip at nvidia.com>
Date: Fri, 10 Oct 2025 14:23:03 -0700
Subject: [PATCH 1/5] Redesign Straight-Line Strength Reduction (SLSR)

---
 .../Scalar/StraightLineStrengthReduce.cpp     | 1101 +++++++++++++----
 .../NVPTX/slsr-i8-gep.ll                      |  156 +++
 .../NVPTX/slsr-var-delta.ll                   |   49 +
 .../path-compression.ll                       |   35 +
 .../pick-candidate.ll                         |   32 +
 .../StraightLineStrengthReduce/slsr-add.ll    |   96 ++
 .../StraightLineStrengthReduce/slsr-gep.ll    |  148 +++
 7 files changed, 1346 insertions(+), 271 deletions(-)
 create mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
 create mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
 create mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
 create mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll

diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 7d017095c88ce..c43195e9825e7 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -12,17 +12,16 @@
 // effective in simplifying arithmetic statements derived from an unrolled loop.
 // It can also simplify the logic of SeparateConstOffsetFromGEP.
 //
-// There are many optimizations we can perform in the domain of SLSR. This file
-// for now contains only an initial step. Specifically, we look for strength
-// reduction candidates in the following forms:
+// There are many optimizations we can perform in the domain of SLSR.
+// We look for strength reduction candidates in the following forms:
 //
-// Form 1: B + i * S
-// Form 2: (B + i) * S
-// Form 3: &B[i * S]
+// Form Add: B + i * S
+// Form Mul: (B + i) * S
+// Form GEP: &B[i * S]
 //
 // where S is an integer variable, and i is a constant integer. If we found two
 // candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
-// in a simpler way with respect to S1. For example,
+// in a simpler way with respect to S1 (index delta). For example,
 //
 // S1: X = B + i * S
 // S2: Y = B + i' * S   => X + (i' - i) * S
@@ -35,8 +34,26 @@
 //
 // Note: (i' - i) * S is folded to the extent possible.
 //
+// For forms Add and GEP, we can also rewrite a candidate in a simpler way
+// with respect to other dominating candidates whose B or S differs
+// but whose other parts are the same. For example,
+//
+// Base Delta:
+// S1: X = B  + i * S
+// S2: Y = B' + i * S   => X + (B' - B)
+//
+// S1: X = &B [i * S]
+// S2: Y = &B'[i * S]   => X + (B' - B)
+//
+// Stride Delta:
+// S1: X = B + i * S
+// S2: Y = B + i * S'   => X + i * (S' - S)
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i * S']   => X + i * (S' - S)
+//
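+// For instance (an illustrative sketch; the new slsr-i8-gep.ll tests show the
+// concrete IR), with B' = &B[1] a Base Delta rewrite of GEP candidates is
+//
+// S1: X = &B [i * S]
+// S2: Y = &B'[i * S]   => X + 1
+//
+// and, likewise, with S' = S + 1 a Stride Delta rewrite gives Y => X + i.
+//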
 // This rewriting is in general a good idea. The code patterns we focus on
-// usually come from loop unrolling, so (i' - i) * S is likely the same
+// usually come from loop unrolling, so the delta is likely the same
 // across iterations and can be reused. When that happens, the optimized form
 // takes only one add starting from the second iteration.
 //
@@ -47,19 +64,14 @@
 // TODO:
 //
 // - Floating point arithmetics when fast math is enabled.
-//
-// - SLSR may decrease ILP at the architecture level. Targets that are very
-//   sensitive to ILP may want to disable it. Having SLSR to consider ILP is
-//   left as future work.
-//
-// - When (i' - i) is constant but i and i' are not, we could still perform
-//   SLSR.
 
 #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
@@ -86,11 +98,14 @@
 #include <cstdint>
 #include <limits>
 #include <list>
+#include <queue>
 #include <vector>
 
 using namespace llvm;
 using namespace PatternMatch;
 
+#define DEBUG_TYPE "slsr"
+
 static const unsigned UnknownAddressSpace =
     std::numeric_limits<unsigned>::max();
 
@@ -142,15 +157,23 @@ class StraightLineStrengthReduce {
       GEP,     // &B[..][i * S][..]
     };
 
+    enum DKind {
+      InvalidDelta, // reserved for the default constructor
+      IndexDelta,   // Delta is a constant from Index
+      BaseDelta,    // Delta is a constant or variable from Base
+      StrideDelta,  // Delta is a constant or variable from Stride
+    };
+
     Candidate() = default;
     Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
-              Instruction *I)
-        : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
+              Instruction *I, const SCEV *StrideSCEV)
+        : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+          StrideSCEV(StrideSCEV) {}
 
     Kind CandidateKind = Invalid;
 
     const SCEV *Base = nullptr;
-
+    // TODO: Swap the names of Index and Stride.
     // Note that Index and Stride of a GEP candidate do not necessarily have the
     // same integer type. In that case, during rewriting, Stride will be
     // sign-extended or truncated to Index's type.
@@ -177,22 +200,136 @@ class StraightLineStrengthReduce {
     // Points to the immediate basis of this candidate, or nullptr if we cannot
     // find any basis for this candidate.
     Candidate *Basis = nullptr;
+
+    DKind DeltaKind = InvalidDelta;
+
+    // Store the SCEV of Stride to compute deltas between different strides.
+    const SCEV *StrideSCEV = nullptr;
+
+    // Points to (Y - X) that will be used to rewrite this candidate.
+    Value *Delta = nullptr;
+
+    /// Cost model: Evaluate the computational efficiency of the candidate.
+    ///
+    /// Efficiency levels (higher is better):
+    ///   5 - No instruction:
+    ///       [Variable] or [Const]
+    ///   4 - One instruction with one variable:
+    ///       [Variable + Const] or [Variable * Const]
+    ///   3 - One instruction with two variables:
+    ///       [Variable + Variable] or [Variable * Variable]
+    ///   2 - Two instructions with one variable:
+    ///       [Const + Const * Variable]
+    ///   1 - Two instructions with two variables:
+    ///       [Variable + Const * Variable]
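+    ///
+    /// For example (illustrative; B and S are variables unless stated
+    /// otherwise): "B + 1 * S" is level 3, "B + 4 * S" is level 1, and
+    /// "0 + 4 * S" (constant zero base) is level 4.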
+    static unsigned getComputationEfficiency(Kind CandidateKind,
+                                             const ConstantInt *Index,
+                                             const Value *Stride,
+                                             const SCEV *Base = nullptr) {
+      bool IsConstantBase = false;
+      bool IsZeroBase = false;
+      // When evaluating the efficiency of a rewrite, if the Basis's SCEV is
+      // not available, conservatively assume the base is not constant.
+      if (auto *ConstBase = dyn_cast_or_null<SCEVConstant>(Base)) {
+        IsConstantBase = true;
+        IsZeroBase = ConstBase->getValue()->isZero();
+      }
+
+      bool IsConstantStride = isa<ConstantInt>(Stride);
+      bool IsZeroStride =
+          IsConstantStride && cast<ConstantInt>(Stride)->isZero();
+      // All constants
+      if (IsConstantBase && IsConstantStride)
+        return 5;
+
+      // [(Base + Index) * Stride]
+      if (CandidateKind == Mul) {
+        if (IsZeroStride)
+          return 5;
+        if (Index->isZero())
+          return (IsConstantStride || IsConstantBase) ? 4 : 3;
+
+        if (IsConstantBase)
+          return IsZeroBase && (Index->isOne() || Index->isMinusOne()) ? 5 : 4;
+
+        if (IsConstantStride) {
+          auto *CI = cast<ConstantInt>(Stride);
+          return (CI->isOne() || CI->isMinusOne()) ? 4 : 2;
+        }
+        return 1;
+      }
+
+      // Base + Index * Stride
+      assert(CandidateKind == Add || CandidateKind == GEP);
+      if (Index->isZero() || IsZeroStride)
+        return 5;
+
+      bool IsSimpleIndex = Index->isOne() || Index->isMinusOne();
+
+      if (IsConstantBase)
+        return IsZeroBase ? (IsSimpleIndex ? 5 : 4) : (IsSimpleIndex ? 4 : 2);
+
+      if (IsConstantStride)
+        return IsZeroStride ? 5 : 4;
+
+      if (IsSimpleIndex)
+        return 3;
+
+      return 1;
+    }
+
+    // Evaluate if the given delta is profitable to rewrite this candidate.
+    bool isProfitableRewrite(const Value *Delta, const DKind DeltaKind) const {
+      // This function cannot accurately evaluate the profit of the whole
+      // expression without context. A candidate (B + I * S) cannot express
+      // whether this instruction has to compute (I * S) itself; that product
+      // may be shared with other candidates or may need extra instructions to
+      // compute.
+      // If the rewritten form has the same strength, still rewrite to
+      // (X + Delta) since it may expose more CSE opportunities on Delta, as
+      // unrolled loops usually have an identical Delta for each unrolled body.
+      //
+      // Note: this function should only be used for Index Delta rewrites.
+      // Base and Stride deltas need context info to evaluate the register
+      // pressure impact of a variable delta.
+      return getComputationEfficiency(CandidateKind, Index, Stride, Base) <=
+             getRewriteProfit(Delta, DeltaKind);
+    }
+
+    // Evaluate the rewrite profit of this candidate with its Basis
+    unsigned getRewriteProfit() const {
+      return Basis ? getRewriteProfit(Delta, DeltaKind) : 0;
+    }
+
+    // Evaluate the rewrite profit of this candidate with a given delta
+    unsigned getRewriteProfit(const Value *Delta, const DKind DeltaKind) const {
+      switch (DeltaKind) {
+      case BaseDelta: // [X + Delta]
+        return getComputationEfficiency(
+            CandidateKind,
+            ConstantInt::get(cast<IntegerType>(Delta->getType()), 1), Delta);
+      case StrideDelta: // [X + Index * Delta]
+        return getComputationEfficiency(CandidateKind, Index, Delta);
+      case IndexDelta: // [X + Delta * Stride]
+        return getComputationEfficiency(CandidateKind, cast<ConstantInt>(Delta),
+                                        Stride);
+      default:
+        return 0;
+      }
+    }
+
+    bool isHighEfficiency() const {
+      return getComputationEfficiency(CandidateKind, Index, Stride, Base) >= 4;
+    }
   };
 
   bool runOnFunction(Function &F);
 
 private:
-  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
-  // share the same base and stride.
-  bool isBasisFor(const Candidate &Basis, const Candidate &C);
-
+  // Fetch straight-line basis for rewriting C, update C.Basis to point to it,
+  // and store the delta between C and its Basis in C.Delta.
+  void setBasisAndDeltaFor(Candidate &C);
   // Returns whether the candidate can be folded into an addressing mode.
-  bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
-                  const DataLayout *DL);
-
-  // Returns true if C is already in a simplest form and not worth being
-  // rewritten.
-  bool isSimplestForm(const Candidate &C);
+  bool isFoldable(const Candidate &C, TargetTransformInfo *TTI);
 
   // Checks whether I is in a candidate form. If so, adds all the matching forms
   // to Candidates, and tries to find the immediate basis for each of them.
@@ -216,12 +353,6 @@ class StraightLineStrengthReduce {
   // Allocate candidates and find bases for GetElementPtr instructions.
   void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
 
-  // A helper function that scales Idx with ElementSize before invoking
-  // allocateCandidatesAndFindBasis.
-  void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
-                                            Value *S, uint64_t ElementSize,
-                                            Instruction *I);
-
   // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
   // basis.
   void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
@@ -231,12 +362,6 @@ class StraightLineStrengthReduce {
   // Rewrites candidate C with respect to Basis.
   void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
 
-  // A helper function that factors ArrayIdx to a product of a stride and a
-  // constant index, and invokes allocateCandidatesAndFindBasis with the
-  // factorings.
-  void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
-                        GetElementPtrInst *GEP);
-
   // Emit code that computes the "bump" from Basis to C.
   static Value *emitBump(const Candidate &Basis, const Candidate &C,
                          IRBuilder<> &Builder, const DataLayout *DL);
@@ -247,12 +372,205 @@ class StraightLineStrengthReduce {
   TargetTransformInfo *TTI = nullptr;
   std::list<Candidate> Candidates;
 
-  // Temporarily holds all instructions that are unlinked (but not deleted) by
-  // rewriteCandidateWithBasis. These instructions will be actually removed
-  // after all rewriting finishes.
-  std::vector<Instruction *> UnlinkedInstructions;
+  // Map from a SCEV to the instructions that represent its value;
+  // the instructions are sorted in depth-first order.
+  DenseMap<const SCEV *, SmallSetVector<Instruction *, 2>> SCEVToInsts;
+
+  // Record the dependency between instructions. If C.Basis == B, we would have
+  // {B.Ins -> {C.Ins, ...}}.
+  MapVector<Instruction *, std::vector<Instruction *>> DependencyGraph;
+
+  // Map between each instruction and its possible candidates.
+  DenseMap<Instruction *, SmallVector<Candidate *, 3>> RewriteCandidates;
+
+  // All instructions that have candidates, sorted in topological order based
+  // on the dependency graph, from roots to leaves.
+  std::vector<Instruction *> SortedCandidateInsts;
+
+  // Record all instructions that are already rewritten and will be removed
+  // later.
+  std::vector<Instruction *> DeadInstructions;
+
+  // Classify candidates by Delta kind.
+  class CandidateDictTy {
+  public:
+    using CandsTy = SmallVector<Candidate *, 8>;
+    using BBToCandsTy = DenseMap<const BasicBlock *, CandsTy>;
+
+  private:
+    // Index delta Basis must have the same (Base, StrideSCEV, Inst.Type)
+    using IndexDeltaKeyTy = std::tuple<const SCEV *, const SCEV *, Type *>;
+    DenseMap<IndexDeltaKeyTy, BBToCandsTy> IndexDeltaCandidates;
+
+    // Base delta Basis must have the same (StrideSCEV, Index, Inst.Type)
+    using BaseDeltaKeyTy = std::tuple<const SCEV *, ConstantInt *, Type *>;
+    DenseMap<BaseDeltaKeyTy, BBToCandsTy> BaseDeltaCandidates;
+
+    // Stride delta Basis must have the same (Base, Index, Inst.Type)
+    using StrideDeltaKeyTy = std::tuple<const SCEV *, ConstantInt *, Type *>;
+    DenseMap<StrideDeltaKeyTy, BBToCandsTy> StrideDeltaCandidates;
+
+  public:
+    // TODO: Disable index delta on GEP after we completely move
+    // from typed GEP to PtrAdd.
+    const BBToCandsTy *getCandidatesWithDeltaKind(const Candidate &C,
+                                                  Candidate::DKind K) const {
+      assert(K != Candidate::InvalidDelta);
+      if (K == Candidate::IndexDelta) {
+        IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, C.Ins->getType());
+        auto It = IndexDeltaCandidates.find(IndexDeltaKey);
+        if (It != IndexDeltaCandidates.end())
+          return &It->second;
+      } else if (K == Candidate::BaseDelta) {
+        BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, C.Ins->getType());
+        auto It = BaseDeltaCandidates.find(BaseDeltaKey);
+        if (It != BaseDeltaCandidates.end())
+          return &It->second;
+      } else {
+        assert(K == Candidate::StrideDelta);
+        StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, C.Ins->getType());
+        auto It = StrideDeltaCandidates.find(StrideDeltaKey);
+        if (It != StrideDeltaCandidates.end())
+          return &It->second;
+      }
+      return nullptr;
+    }
+
+    // Pointers to C must remain valid until CandidateDict is cleared.
+    void add(Candidate &C) {
+      Type *ValueType = C.Ins->getType();
+      BasicBlock *BB = C.Ins->getParent();
+      IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, ValueType);
+      BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, ValueType);
+      StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, ValueType);
+      IndexDeltaCandidates[IndexDeltaKey][BB].push_back(&C);
+      BaseDeltaCandidates[BaseDeltaKey][BB].push_back(&C);
+      StrideDeltaCandidates[StrideDeltaKey][BB].push_back(&C);
+    }
+    // Remove all mappings from set
+    void clear() {
+      IndexDeltaCandidates.clear();
+      BaseDeltaCandidates.clear();
+      StrideDeltaCandidates.clear();
+    }
+  } CandidateDict;
+
+  const SCEV *getAndRecordSCEV(Value *V) {
+    auto *S = SE->getSCEV(V);
+    if (auto *I = dyn_cast<Instruction>(V))
+      if (!isa<SCEVCouldNotCompute>(S) && !isa<SCEVUnknown>(S) &&
+          !isa<SCEVConstant>(S))
+        SCEVToInsts[S].insert(I);
+
+    return S;
+  }
+
+  // Get the nearest instruction before CI that represents the value of S;
+  // return nullptr if no instruction is associated with S or S is not a
+  // reusable expression.
+  Value *getNearestValueOfSCEV(const SCEV *S, const Instruction *CI) const {
+    if (isa<SCEVCouldNotCompute>(S))
+      return nullptr;
+
+    if (auto *SU = dyn_cast<SCEVUnknown>(S))
+      return SU->getValue();
+    if (auto *SC = dyn_cast<SCEVConstant>(S))
+      return SC->getValue();
+
+    auto It = SCEVToInsts.find(S);
+    if (It == SCEVToInsts.end())
+      return nullptr;
+
+    for (Instruction *I : reverse(It->second))
+      if (DT->dominates(I, CI))
+        return I;
+
+    return nullptr;
+  }
+
+  struct DeltaInfo {
+    Candidate *Cand;
+    Candidate::DKind DeltaKind;
+    Value *Delta;
+
+    DeltaInfo()
+        : Cand(nullptr), DeltaKind(Candidate::InvalidDelta), Delta(nullptr) {}
+    DeltaInfo(Candidate *Cand, Candidate::DKind DeltaKind, Value *Delta)
+        : Cand(Cand), DeltaKind(DeltaKind), Delta(Delta) {}
+    operator bool() const { return Cand != nullptr; }
+  };
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const DeltaInfo &DI);
+
+  DeltaInfo compressPath(Candidate &C, Candidate *Basis) const;
+
+  Candidate *pickRewriteCandidate(Instruction *I) const;
+  void sortCandidateInstructions();
+  static Constant *getIndexDelta(Candidate &C, Candidate &Basis);
+  static bool isSimilar(Candidate &C, Candidate &Basis, Candidate::DKind K);
+
+  // Add Basis -> C to DependencyGraph and propagate the dependencies of
+  // C.Stride and C.Delta to C.
+  void addDependency(Candidate &C, Candidate *Basis) {
+    if (Basis)
+      DependencyGraph[Basis->Ins].emplace_back(C.Ins);
+
+    // If any candidate of Inst has a basis, then Inst will be rewritten.
+    // C must be rewritten after Inst, so we need to propagate the
+    // dependency to C.
+    auto PropagateDependency = [&](Instruction *Inst) {
+      if (auto CandsIt = RewriteCandidates.find(Inst);
+          CandsIt != RewriteCandidates.end())
+        if (std::any_of(CandsIt->second.begin(), CandsIt->second.end(),
+                        [](Candidate *Cand) { return Cand->Basis; }))
+          DependencyGraph[Inst].emplace_back(C.Ins);
+    };
+
+    // If C has a variable delta and the delta is a candidate,
+    // propagate its dependency to C
+    if (auto *DeltaInst = dyn_cast_or_null<Instruction>(C.Delta))
+      PropagateDependency(DeltaInst);
+
+    // If the stride is a candidate, propagate its dependency to C
+    if (auto *StrideInst = dyn_cast<Instruction>(C.Stride))
+      PropagateDependency(StrideInst);
+  };
 };
 
+inline llvm::raw_ostream &
+operator<<(llvm::raw_ostream &OS,
+           const StraightLineStrengthReduce::Candidate &C) {
+  OS << "Ins: " << *C.Ins << "\n  Base: " << *C.Base
+     << "\n  Index: " << *C.Index << "\n  Stride: " << *C.Stride
+     << "\n  StrideSCEV: " << *C.StrideSCEV;
+  if (C.Basis)
+    OS << "\n  Delta: " << *C.Delta << "\n  Basis: \n  [ " << *C.Basis << " ]";
+  return OS;
+}
+
+LLVM_ATTRIBUTE_UNUSED
+inline llvm::raw_ostream &
+operator<<(llvm::raw_ostream &OS,
+           const StraightLineStrengthReduce::DeltaInfo &DI) {
+  OS << "Cand: " << *DI.Cand << "\n";
+  OS << "Delta Kind: ";
+  switch (DI.DeltaKind) {
+  case StraightLineStrengthReduce::Candidate::IndexDelta:
+    OS << "Index";
+    break;
+  case StraightLineStrengthReduce::Candidate::BaseDelta:
+    OS << "Base";
+    break;
+  case StraightLineStrengthReduce::Candidate::StrideDelta:
+    OS << "Stride";
+    break;
+  default:
+    break;
+  }
+  OS << "\nDelta: " << *DI.Delta;
+  return OS;
+}
+
 } // end anonymous namespace
 
 char StraightLineStrengthReduceLegacyPass::ID = 0;
@@ -269,17 +587,284 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() {
   return new StraightLineStrengthReduceLegacyPass();
 }
 
-bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
-                                            const Candidate &C) {
-  return (Basis.Ins != C.Ins && // skip the same instruction
-          // They must have the same type too. Basis.Base == C.Base
-          // doesn't guarantee their types are the same (PR23975).
-          Basis.Ins->getType() == C.Ins->getType() &&
-          // Basis must dominate C in order to rewrite C with respect to Basis.
-          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
-          // They share the same base, stride, and candidate kind.
-          Basis.Base == C.Base && Basis.Stride == C.Stride &&
-          Basis.CandidateKind == C.CandidateKind);
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+  if (A.getBitWidth() < B.getBitWidth())
+    A = A.sext(B.getBitWidth());
+  else if (A.getBitWidth() > B.getBitWidth())
+    B = B.sext(A.getBitWidth());
+}
+
+Constant *StraightLineStrengthReduce::getIndexDelta(Candidate &C,
+                                                    Candidate &Basis) {
+  APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+  unifyBitWidth(Idx, BasisIdx);
+  APInt IndexOffset = Idx - BasisIdx;
+  IntegerType *DeltaType =
+      IntegerType::get(C.Ins->getContext(), IndexOffset.getBitWidth());
+  return ConstantInt::get(DeltaType, IndexOffset);
+}
+
+bool StraightLineStrengthReduce::isSimilar(Candidate &C, Candidate &Basis,
+                                           Candidate::DKind K) {
+  bool SameType = false;
+  switch (K) {
+  case Candidate::StrideDelta:
+    SameType = C.StrideSCEV->getType() == Basis.StrideSCEV->getType();
+    break;
+  case Candidate::BaseDelta:
+    SameType = C.Base->getType() == Basis.Base->getType();
+    break;
+  case Candidate::IndexDelta:
+    SameType = true;
+    break;
+  default:;
+  }
+  return SameType && Basis.Ins != C.Ins &&
+         Basis.CandidateKind == C.CandidateKind;
+}
+
+void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) {
+  auto SearchFrom = [this, &C](const CandidateDictTy::BBToCandsTy &BBToCands,
+                               auto IsTarget) -> bool {
+    // Search dominating candidates by walking the immediate-dominator chain
+    // from the candidate's defining block upward. Visiting blocks in this
+    // order ensures we prefer the closest dominating basis.
+    const BasicBlock *BB = C.Ins->getParent();
+    while (BB) {
+      auto It = BBToCands.find(BB);
+      if (It != BBToCands.end())
+        for (Candidate *Basis : reverse(It->second))
+          if (IsTarget(Basis))
+            return true;
+
+      const DomTreeNode *Node = DT->getNode(BB);
+      if (!Node)
+        break;
+      Node = Node->getIDom();
+      BB = Node ? Node->getBlock() : nullptr;
+    }
+    return false;
+  };
+
+  // Priority:
+  // Constant Delta from Index > Constant Delta from Base >
+  // Constant Delta from Stride > Variable Delta from Base or Stride
+  // TODO: Change the priority to align with the cost model.
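+  //
+  // For example (illustrative): with a dominating X = B + 2 * S, rewriting
+  // Y = B + 3 * S via the constant index delta 1 yields Y = X + S, whereas a
+  // variable Base/Stride delta reuses another IR value as the delta, which
+  // may extend live ranges and add register pressure.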
+
+  // First, look for a constant index-diff basis
+  if (const auto *IndexDeltaCandidates =
+          CandidateDict.getCandidatesWithDeltaKind(C, Candidate::IndexDelta)) {
+    bool FoundConstDelta =
+        SearchFrom(*IndexDeltaCandidates, [&DT = DT, &C](Candidate *Basis) {
+          if (isSimilar(C, *Basis, Candidate::IndexDelta)) {
+            assert(DT->dominates(Basis->Ins, C.Ins));
+            auto *Delta = getIndexDelta(C, *Basis);
+            if (!C.isProfitableRewrite(Delta, Candidate::IndexDelta))
+              return false;
+            C.Basis = Basis;
+            C.DeltaKind = Candidate::IndexDelta;
+            C.Delta = Delta;
+            LLVM_DEBUG(dbgs() << "Found delta from Index " << *C.Delta << "\n");
+            return true;
+          }
+          return false;
+        });
+    if (FoundConstDelta)
+      return;
+  }
+
+  // No constant-index-diff basis found; look for the best possible base-diff
+  // or stride-diff basis.
+  // Base/Stride diffs are not supported for the form (B + i) * S.
+  if (C.CandidateKind == Candidate::Mul)
+    return;
+
+  auto For = [this, &C](Candidate::DKind K) {
+    // Return true if a Basis with a constant delta is found, which stops the
+    // search; return false if no Basis was found or the delta is not
+    // constant, which continues the search for a Basis with a constant delta.
+    return [K, this, &C](Candidate *Basis) -> bool {
+      if (!isSimilar(C, *Basis, K))
+        return false;
+
+      assert(DT->dominates(Basis->Ins, C.Ins));
+      const SCEV *BasisPart =
+          (K == Candidate::BaseDelta) ? Basis->Base : Basis->StrideSCEV;
+      const SCEV *CandPart =
+          (K == Candidate::BaseDelta) ? C.Base : C.StrideSCEV;
+      const SCEV *Diff = SE->getMinusSCEV(CandPart, BasisPart);
+      Value *AvailableVal = getNearestValueOfSCEV(Diff, C.Ins);
+      if (!AvailableVal)
+        return false;
+
+      // Record delta if none has been found yet, or the new delta is
+      // a constant that is better than the existing delta.
+      if (!C.Delta || isa<ConstantInt>(AvailableVal)) {
+        C.Delta = AvailableVal;
+        C.Basis = Basis;
+        C.DeltaKind = K;
+      }
+      return isa<ConstantInt>(C.Delta);
+    };
+  };
+
+  if (const auto *BaseDeltaCandidates =
+          CandidateDict.getCandidatesWithDeltaKind(C, Candidate::BaseDelta)) {
+    if (SearchFrom(*BaseDeltaCandidates, For(Candidate::BaseDelta))) {
+      LLVM_DEBUG(dbgs() << "Found delta from Base: " << *C.Delta << "\n");
+      return;
+    }
+  }
+
+  if (const auto *StrideDeltaCandidates =
+          CandidateDict.getCandidatesWithDeltaKind(C, Candidate::StrideDelta)) {
+    if (SearchFrom(*StrideDeltaCandidates, For(Candidate::StrideDelta))) {
+      LLVM_DEBUG(dbgs() << "Found delta from Stride: " << *C.Delta << "\n");
+      return;
+    }
+  }
+
+  // If we did not find a constant delta, we might have found a variable delta
+  if (C.Delta) {
+    LLVM_DEBUG(dbgs() << "Found delta from ";
+               if (C.DeltaKind == Candidate::BaseDelta) dbgs() << "Base: ";
+               else dbgs() << "Stride: "; dbgs() << *C.Delta << "\n");
+    assert(C.DeltaKind != Candidate::InvalidDelta && C.Basis);
+  }
+}
+
+// Compress the path from `Basis` to the deepest Basis in the Basis chain
+// to avoid non-profitable data dependencies and improve ILP.
+// X = A + 1
+// Y = X + 1
+// Z = Y + 1
+// ->
+// X = A + 1
+// Y = A + 2
+// Z = A + 3
+// Return the delta info for C against the new Basis.
+auto StraightLineStrengthReduce::compressPath(Candidate &C,
+                                              Candidate *Basis) const
+    -> DeltaInfo {
+  if (!Basis || !Basis->Basis || C.CandidateKind == Candidate::Mul)
+    return {};
+  Candidate *Root = Basis;
+  Value *NewDelta = nullptr;
+  auto NewKind = Candidate::InvalidDelta;
+
+  while (Root->Basis) {
+    Candidate *NextRoot = Root->Basis;
+    if (C.Base == NextRoot->Base && C.StrideSCEV == NextRoot->StrideSCEV &&
+        isSimilar(C, *NextRoot, Candidate::IndexDelta)) {
+      ConstantInt *CI = cast<ConstantInt>(getIndexDelta(C, *NextRoot));
+      if (CI->isZero() || CI->isOne() || isa<SCEVConstant>(C.StrideSCEV)) {
+        Root = NextRoot;
+        NewKind = Candidate::IndexDelta;
+        NewDelta = CI;
+        continue;
+      }
+    }
+
+    const SCEV *CandPart = nullptr;
+    const SCEV *BasisPart = nullptr;
+    auto CurrKind = Candidate::InvalidDelta;
+    if (C.Base == NextRoot->Base && C.Index == NextRoot->Index) {
+      CandPart = C.StrideSCEV;
+      BasisPart = NextRoot->StrideSCEV;
+      CurrKind = Candidate::StrideDelta;
+    } else if (C.StrideSCEV == NextRoot->StrideSCEV &&
+               C.Index == NextRoot->Index) {
+      CandPart = C.Base;
+      BasisPart = NextRoot->Base;
+      CurrKind = Candidate::BaseDelta;
+    } else
+      break;
+
+    if (!isSimilar(C, *NextRoot, CurrKind))
+      break;
+
+    if (auto DeltaVal =
+            dyn_cast<SCEVConstant>(SE->getMinusSCEV(CandPart, BasisPart))) {
+      Root = NextRoot;
+      NewDelta = DeltaVal->getValue();
+      NewKind = CurrKind;
+    } else
+      break;
+  }
+
+  if (Root != Basis) {
+    assert(NewKind != Candidate::InvalidDelta && NewDelta);
+    LLVM_DEBUG(dbgs() << "Found new Basis with " << *NewDelta
+                      << " from path compression.\n");
+    return {Root, NewKind, NewDelta};
+  }
+
+  return {};
+}
+
+// Topologically sort candidate instructions based on their relationship in
+// dependency graph.
+void StraightLineStrengthReduce::sortCandidateInstructions() {
+  SortedCandidateInsts.clear();
+  // An instruction may have multiple candidates that get different Basis
+  // instructions, and each candidate can get dependencies from Basis and
+  // Stride when Stride will also be rewritten by SLSR. Hence, an instruction
+  // may have multiple dependencies. Use InDegree to ensure all dependencies
+  // are processed before the instruction itself.
+  DenseMap<Instruction *, int> InDegree;
+  for (auto &KV : DependencyGraph) {
+    if (InDegree.find(KV.first) == InDegree.end())
+      InDegree[KV.first] = 0;
+
+    for (auto *Child : KV.second) {
+      InDegree[Child]++;
+    }
+  }
+  std::queue<Instruction *> WorkList;
+  DenseSet<Instruction *> Visited;
+
+  for (auto &KV : DependencyGraph)
+    if (InDegree[KV.first] == 0)
+      WorkList.push(KV.first);
+
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.front();
+    WorkList.pop();
+    if (!Visited.insert(I).second)
+      continue;
+
+    SortedCandidateInsts.push_back(I);
+
+    for (auto *Next : DependencyGraph[I]) {
+      InDegree[Next]--;
+      if (InDegree[Next] == 0)
+        WorkList.push(Next);
+    }
+  }
+
+  assert(SortedCandidateInsts.size() == DependencyGraph.size() &&
+         "Dependency graph should not have cycles");
+}
+
+auto StraightLineStrengthReduce::pickRewriteCandidate(Instruction *I) const
+    -> Candidate * {
+  // Return the candidate of instruction I that has the highest profit.
+  auto It = RewriteCandidates.find(I);
+  if (It == RewriteCandidates.end())
+    return nullptr;
+
+  Candidate *BestC = nullptr;
+  unsigned BestProfit = 0;
+  for (Candidate *C : reverse(It->second))
+    if (C->Basis) {
+      unsigned Profit = C->getRewriteProfit();
+      if (Profit > BestProfit) {
+        BestProfit = Profit;
+        BestC = C;
+      }
+    }
+
+  return BestC;
 }
 
 static bool isGEPFoldable(GetElementPtrInst *GEP,
@@ -299,8 +884,7 @@ static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
 }
 
 bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
-                                            TargetTransformInfo *TTI,
-                                            const DataLayout *DL) {
+                                            TargetTransformInfo *TTI) {
   if (C.CandidateKind == Candidate::Add)
     return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
   if (C.CandidateKind == Candidate::GEP)
@@ -308,75 +892,39 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
   return false;
 }
 
-// Returns true if GEP has zero or one non-zero index.
-static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
-  unsigned NumNonZeroIndices = 0;
-  for (Use &Idx : GEP->indices()) {
-    ConstantInt *ConstIdx = dyn_cast<ConstantInt>(Idx);
-    if (ConstIdx == nullptr || !ConstIdx->isZero())
-      ++NumNonZeroIndices;
-  }
-  return NumNonZeroIndices <= 1;
-}
-
-bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
-  if (C.CandidateKind == Candidate::Add) {
-    // B + 1 * S or B + (-1) * S
-    return C.Index->isOne() || C.Index->isMinusOne();
-  }
-  if (C.CandidateKind == Candidate::Mul) {
-    // (B + 0) * S
-    return C.Index->isZero();
-  }
-  if (C.CandidateKind == Candidate::GEP) {
-    // (char*)B + S or (char*)B - S
-    return ((C.Index->isOne() || C.Index->isMinusOne()) &&
-            hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
-  }
-  return false;
-}
-
-// TODO: We currently implement an algorithm whose time complexity is linear in
-// the number of existing candidates. However, we could do better by using
-// ScopedHashTable. Specifically, while traversing the dominator tree, we could
-// maintain all the candidates that dominate the basic block being traversed in
-// a ScopedHashTable. This hash table is indexed by the base and the stride of
-// a candidate. Therefore, finding the immediate basis of a candidate boils down
-// to one hash-table look up.
 void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
     Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
     Instruction *I) {
-  Candidate C(CT, B, Idx, S, I);
-  // SLSR can complicate an instruction in two cases:
-  //
-  // 1. If we can fold I into an addressing mode, computing I is likely free or
-  // takes only one instruction.
-  //
-  // 2. I is already in a simplest form. For example, when
-  //      X = B + 8 * S
-  //      Y = B + S,
-  //    rewriting Y to X - 7 * S is probably a bad idea.
+  // Record the SCEV of S so that we may use it as a variable delta.
+  // Ensure that we rewrite C with existing IR that reproduces the delta value.
+
+  Candidate C(CT, B, Idx, S, I, getAndRecordSCEV(S));
+  // If we can fold I into an addressing mode, computing I is likely free or
+  // takes only one instruction. So, we don't need to analyze or rewrite it.
   //
-  // In the above cases, we still add I to the candidate list so that I can be
-  // the basis of other candidates, but we leave I's basis blank so that I
-  // won't be rewritten.
-  if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
-    // Try to compute the immediate basis of C.
-    unsigned NumIterations = 0;
-    // Limit the scan radius to avoid running in quadratice time.
-    static const unsigned MaxNumIterations = 50;
-    for (auto Basis = Candidates.rbegin();
-         Basis != Candidates.rend() && NumIterations < MaxNumIterations;
-         ++Basis, ++NumIterations) {
-      if (isBasisFor(*Basis, C)) {
-        C.Basis = &(*Basis);
-        break;
-      }
+  // Currently, this algorithm can at best optimize complex computations into
+  // a `variable +/* constant` form. However, some targets have stricter
+  // constraints on their addressing modes.
+  // For example, a `variable + constant` can only be folded into an addressing
+  // mode if the constant falls within a certain range.
+  // So, we also check whether the instruction is already efficient enough
+  // that the strength reduction algorithm should leave it alone.
+  if (!isFoldable(C, TTI) && !C.isHighEfficiency()) {
+    setBasisAndDeltaFor(C);
+
+    // Compress unnecessary rewrites to improve ILP.
+    if (auto Res = compressPath(C, C.Basis)) {
+      C.Basis = Res.Cand;
+      C.DeltaKind = Res.DeltaKind;
+      C.Delta = Res.Delta;
     }
   }
   // Regardless of whether we find a basis for C, we need to push C to the
   // candidate list so that it can be the basis of other candidates.
+  LLVM_DEBUG(dbgs() << "Allocated Candidate: " << C << "\n");
   Candidates.push_back(C);
+  RewriteCandidates[C.Ins].push_back(&Candidates.back());
+  CandidateDict.add(Candidates.back());
 }
 
 void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
@@ -475,54 +1023,6 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
   }
 }
 
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
-    const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
-    Instruction *I) {
-  // I = B + sext(Idx *nsw S) * ElementSize
-  //   = B + (sext(Idx) * sext(S)) * ElementSize
-  //   = B + (sext(Idx) * ElementSize) * sext(S)
-  // Casting to IntegerType is safe because we skipped vector GEPs.
-  IntegerType *PtrIdxTy = cast<IntegerType>(DL->getIndexType(I->getType()));
-  ConstantInt *ScaledIdx = ConstantInt::get(
-      PtrIdxTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
-  allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
-}
-
-void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
-                                                  const SCEV *Base,
-                                                  uint64_t ElementSize,
-                                                  GetElementPtrInst *GEP) {
-  // At least, ArrayIdx = ArrayIdx *nsw 1.
-  allocateCandidatesAndFindBasisForGEP(
-      Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
-      ArrayIdx, ElementSize, GEP);
-  Value *LHS = nullptr;
-  ConstantInt *RHS = nullptr;
-  // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
-  // itself. This would allow us to handle the shl case for free. However,
-  // matching SCEVs has two issues:
-  //
-  // 1. this would complicate rewriting because the rewriting procedure
-  // would have to translate SCEVs back to IR instructions. This translation
-  // is difficult when LHS is further evaluated to a composite SCEV.
-  //
-  // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
-  // to strip nsw/nuw flags which are critical for SLSR to trace into
-  // sext'ed multiplication.
-  if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
-    // SLSR is currently unsafe if i * S may overflow.
-    // GEP = Base + sext(LHS *nsw RHS) * ElementSize
-    allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
-  } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
-    // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
-    //     = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
-    APInt One(RHS->getBitWidth(), 1);
-    ConstantInt *PowerOf2 =
-        ConstantInt::get(RHS->getContext(), One << RHS->getValue());
-    allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
-  }
-}
-
 void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
     GetElementPtrInst *GEP) {
   // TODO: handle vector GEPs
@@ -546,11 +1046,14 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
     const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
     Value *ArrayIdx = GEP->getOperand(I);
     uint64_t ElementSize = GTI.getSequentialElementStride(*DL);
+    IntegerType *PtrIdxTy = cast<IntegerType>(DL->getIndexType(GEP->getType()));
+    ConstantInt *ElementSizeIdx = ConstantInt::get(PtrIdxTy, ElementSize, true);
     if (ArrayIdx->getType()->getIntegerBitWidth() <=
         DL->getIndexSizeInBits(GEP->getAddressSpace())) {
       // Skip factoring if ArrayIdx is wider than the index size, because
       // ArrayIdx is implicitly truncated to the index size.
-      factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+      allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx,
+                                     ArrayIdx, GEP);
     }
     // When ArrayIdx is the sext of a value, we try to factor that value as
     // well.  Handling this case is important because array indices are
@@ -561,55 +1064,101 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
             DL->getIndexSizeInBits(GEP->getAddressSpace())) {
       // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
       // because TruncatedArrayIdx is implicitly truncated to the pointer size.
-      factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+      allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx,
+                                     TruncatedArrayIdx, GEP);
     }
 
     IndexExprs[I - 1] = OrigIndexExpr;
   }
 }
 
-// A helper function that unifies the bitwidth of A and B.
-static void unifyBitWidth(APInt &A, APInt &B) {
-  if (A.getBitWidth() < B.getBitWidth())
-    A = A.sext(B.getBitWidth());
-  else if (A.getBitWidth() > B.getBitWidth())
-    B = B.sext(A.getBitWidth());
-}
-
 Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
                                             const Candidate &C,
                                             IRBuilder<> &Builder,
                                             const DataLayout *DL) {
-  APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
-  unifyBitWidth(Idx, BasisIdx);
-  APInt IndexOffset = Idx - BasisIdx;
+  auto CreateMul = [&](Value *LHS, Value *RHS) {
+    if (isa<ConstantInt>(RHS)) {
+      APInt ConstRHS = cast<ConstantInt>(RHS)->getValue();
+      IntegerType *DeltaType =
+          IntegerType::get(C.Ins->getContext(), ConstRHS.getBitWidth());
+      if (ConstRHS.isPowerOf2()) {
+        ConstantInt *Exponent =
+            ConstantInt::get(DeltaType, ConstRHS.logBase2());
+        return Builder.CreateShl(LHS, Exponent);
+      }
+      if (ConstRHS.isNegatedPowerOf2()) {
+        ConstantInt *Exponent =
+            ConstantInt::get(DeltaType, (-ConstRHS).logBase2());
+        return Builder.CreateNeg(Builder.CreateShl(LHS, Exponent));
+      }
+    }
 
-  // Compute Bump = C - Basis = (i' - i) * S.
-  // Common case 1: if (i' - i) is 1, Bump = S.
-  if (IndexOffset == 1)
-    return C.Stride;
-  // Common case 2: if (i' - i) is -1, Bump = -S.
-  if (IndexOffset.isAllOnes())
-    return Builder.CreateNeg(C.Stride);
+    return Builder.CreateMul(LHS, RHS);
+  };
 
-  // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
-  // have different bit widths.
-  IntegerType *DeltaType =
-      IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
-  Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
-  if (IndexOffset.isPowerOf2()) {
-    // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
-    ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
-    return Builder.CreateShl(ExtendedStride, Exponent);
-  }
-  if (IndexOffset.isNegatedPowerOf2()) {
-    // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i).
-    ConstantInt *Exponent =
-        ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
-    return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+  if (C.DeltaKind == Candidate::IndexDelta) {
+    APInt IndexOffset = cast<ConstantInt>(C.Delta)->getValue();
+    // IndexDelta
+    // X = B + i * S
+    // Y = B + i' * S
+    //   = B + i * S + (i' - i) * S
+    //   = X + Delta * S
+    // Bump = (i' - i) * S
+
+    // If Delta is 0, C is fully redundant with C.Basis;
+    // just replace C.Ins with Basis.Ins.
+    if (IndexOffset.isZero())
+      return nullptr;
+
+    // Compute Bump = C - Basis = (i' - i) * S.
+    // Common case 1: if (i' - i) is 1, Bump = S.
+    if (IndexOffset == 1)
+      return C.Stride;
+    // Common case 2: if (i' - i) is -1, Bump = -S.
+    if (IndexOffset.isAllOnes())
+      return Builder.CreateNeg(C.Stride);
+
+    IntegerType *DeltaType =
+        IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+    Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+
+    return CreateMul(ExtendedStride, C.Delta);
+  } else {
+    assert(C.DeltaKind == Candidate::StrideDelta ||
+           C.DeltaKind == Candidate::BaseDelta);
+    assert(C.CandidateKind != Candidate::Mul);
+    // StrideDelta
+    // X = B + i * S
+    // Y = B + i * S'
+    //   = B + i * (S + Delta)
+    //   = B + i * S + i * Delta
+    //   = X + i * StrideDelta
+    // Bump = i * (S' - S)
+    //
+    // BaseDelta
+    // X = B  + i * S
+    // Y = B' + i * S
+    //   = (B + Delta) + i * S
+    //   = X + BaseDelta
+    // Bump = (B' - B).
+    Value *Bump = C.Delta;
+    if (C.DeltaKind == Candidate::StrideDelta) {
+      // If this value is consumed by a GEP, promote StrideDelta before doing
+      // StrideDelta * Index to ensure the same semantics as the original GEP.
+      if (C.CandidateKind == Candidate::GEP) {
+        auto *GEP = cast<GetElementPtrInst>(C.Ins);
+        Type *NewScalarIndexTy =
+            DL->getIndexType(GEP->getPointerOperandType()->getScalarType());
+        Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy);
+      }
+      if (!C.Index->isOne()) {
+        Value *ExtendedIndex =
+            Builder.CreateSExtOrTrunc(C.Index, Bump->getType());
+        Bump = CreateMul(Bump, ExtendedIndex);
+      }
+    }
+    return Bump;
   }
-  Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
-  return Builder.CreateMul(ExtendedStride, Delta);
 }
 
 void StraightLineStrengthReduce::rewriteCandidateWithBasis(
@@ -617,62 +1166,62 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
   if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter))
     return;
 
-  assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
-         C.Stride == Basis.Stride);
-  // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
-  // basis of a candidate cannot be unlinked before the candidate.
-  assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
-
-  // An instruction can correspond to multiple candidates. Therefore, instead of
-  // simply deleting an instruction when we rewrite it, we mark its parent as
-  // nullptr (i.e. unlink it) so that we can skip the candidates whose
-  // instruction is already rewritten.
-  if (!C.Ins->getParent())
-    return;
+  // If one of Base, Index, and Stride is different,
+  // the other parts must be the same.
+  assert(C.Delta && C.CandidateKind == Basis.CandidateKind &&
+         ((C.Base == Basis.Base && C.StrideSCEV == Basis.StrideSCEV &&
+           C.DeltaKind == Candidate::IndexDelta) ||
+          (C.Base == Basis.Base && C.Index == Basis.Index &&
+           C.DeltaKind == Candidate::StrideDelta) ||
+          (C.StrideSCEV == Basis.StrideSCEV && C.Index == Basis.Index &&
+           C.DeltaKind == Candidate::BaseDelta)));
 
   IRBuilder<> Builder(C.Ins);
   Value *Bump = emitBump(Basis, C, Builder, DL);
   Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
-  switch (C.CandidateKind) {
-  case Candidate::Add:
-  case Candidate::Mul: {
-    // C = Basis + Bump
-    Value *NegBump;
-    if (match(Bump, m_Neg(m_Value(NegBump)))) {
-      // If Bump is a neg instruction, emit C = Basis - (-Bump).
-      Reduced = Builder.CreateSub(Basis.Ins, NegBump);
-      // We only use the negative argument of Bump, and Bump itself may be
-      // trivially dead.
-      RecursivelyDeleteTriviallyDeadInstructions(Bump);
-    } else {
-      // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
-      // usually unsound, e.g.,
-      //
-      // X = (-2 +nsw 1) *nsw INT_MAX
-      // Y = (-2 +nsw 3) *nsw INT_MAX
-      //   =>
-      // Y = X + 2 * INT_MAX
-      //
-      // Neither + and * in the resultant expression are nsw.
-      Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+  // If the delta is 0, C is fully redundant with Basis and Bump is nullptr;
+  // just replace C.Ins with Basis.Ins.
+  if (!Bump)
+    Reduced = Basis.Ins;
+  else {
+    switch (C.CandidateKind) {
+    case Candidate::Add:
+    case Candidate::Mul: {
+      // C = Basis + Bump
+      Value *NegBump;
+      if (match(Bump, m_Neg(m_Value(NegBump)))) {
+        // If Bump is a neg instruction, emit C = Basis - (-Bump).
+        Reduced = Builder.CreateSub(Basis.Ins, NegBump);
+        // We only use the negative argument of Bump, and Bump itself may be
+        // trivially dead.
+        RecursivelyDeleteTriviallyDeadInstructions(Bump);
+      } else {
+        // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+        // usually unsound, e.g.,
+        //
+        // X = (-2 +nsw 1) *nsw INT_MAX
+        // Y = (-2 +nsw 3) *nsw INT_MAX
+        //   =>
+        // Y = X + 2 * INT_MAX
+        //
+        // Neither + and * in the resultant expression are nsw.
+        Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+      }
+      break;
     }
-    break;
-  }
-  case Candidate::GEP: {
-    bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
-    // C = (char *)Basis + Bump
-    Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds);
-    break;
+    case Candidate::GEP: {
+      bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+      // C = (char *)Basis + Bump
+      Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds);
+      break;
+    }
+    default:
+      llvm_unreachable("C.CandidateKind is invalid");
+    };
+    Reduced->takeName(C.Ins);
   }
-  default:
-    llvm_unreachable("C.CandidateKind is invalid");
-  };
-  Reduced->takeName(C.Ins);
   C.Ins->replaceAllUsesWith(Reduced);
-  // Unlink C.Ins so that we can skip other candidates also corresponding to
-  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
-  C.Ins->removeFromParent();
-  UnlinkedInstructions.push_back(C.Ins);
+  DeadInstructions.push_back(C.Ins);
 }
 
 bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
@@ -686,33 +1235,43 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
 }
 
 bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  LLVM_DEBUG(dbgs() << "SLSR on Function: " << F.getName() << "\n");
   // Traverse the dominator tree in the depth-first order. This order makes sure
   // all bases of a candidate are in Candidates when we process it.
   for (const auto Node : depth_first(DT))
     for (auto &I : *(Node->getBlock()))
       allocateCandidatesAndFindBasis(&I);
 
-  // Rewrite candidates in the reverse depth-first order. This order makes sure
-  // a candidate being rewritten is not a basis for any other candidate.
-  while (!Candidates.empty()) {
-    const Candidate &C = Candidates.back();
-    if (C.Basis != nullptr) {
-      rewriteCandidateWithBasis(C, *C.Basis);
-    }
-    Candidates.pop_back();
-  }
-
-  // Delete all unlink instructions.
-  for (auto *UnlinkedInst : UnlinkedInstructions) {
-    for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
-      Value *Op = UnlinkedInst->getOperand(I);
-      UnlinkedInst->setOperand(I, nullptr);
-      RecursivelyDeleteTriviallyDeadInstructions(Op);
-    }
-    UnlinkedInst->deleteValue();
+  // Build the dependency graph and sort candidate instructions from dependency
+  // roots to leaves
+  for (auto &C : Candidates) {
+    if (DependencyGraph.find(C.Ins) == DependencyGraph.end())
+      DependencyGraph[C.Ins] = {};
+    addDependency(C, C.Basis);
   }
-  bool Ret = !UnlinkedInstructions.empty();
-  UnlinkedInstructions.clear();
+  sortCandidateInstructions();
+
+  // Rewrite candidates in topological order so that a Candidate is always
+  // rewritten before its Basis.
+  for (Instruction *I : reverse(SortedCandidateInsts))
+    if (Candidate *C = pickRewriteCandidate(I))
+      rewriteCandidateWithBasis(*C, *C->Basis);
+
+  for (auto *DeadIns : DeadInstructions)
+    // A dead instruction may be an operand of another dead instruction;
+    // don't delete an instruction twice.
+    if (DeadIns->getParent())
+      RecursivelyDeleteTriviallyDeadInstructions(DeadIns);
+
+  bool Ret = !DeadInstructions.empty();
+  DeadInstructions.clear();
+  DependencyGraph.clear();
+  RewriteCandidates.clear();
+  SortedCandidateInsts.clear();
+  // First clear all references to candidates in the list
+  CandidateDict.clear();
+  // Then destroy the list
+  Candidates.clear();
   return Ret;
 }
 
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
new file mode 100644
index 0000000000000..a2ae4cbe2b4cc
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
@@ -0,0 +1,156 @@
+; RUN: opt < %s -passes=slsr -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; CHECK-LABEL: slsr_i8_zero_delta(
+; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
+; CHECK-NEXT:   [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
+; CHECK-NEXT:   [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
+; CHECK-NEXT:   [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 64
+; CHECK-NEXT:   [[LOAD1:%.*]] = load i8, ptr [[GEP1]]
+; CHECK-NEXT:   [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 96
+; CHECK-NEXT:   [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
+define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) {
+  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
+  %load0 = load i8, ptr %getElem0.1
+
+  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
+  %load1 = load i8, ptr %getElem1.1
+
+  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
+  %load2 = load i8, ptr %getElem2.1
+
+  %out0 = add i8 %load0, %load1
+  %out1 = add i8 %out0, %load2
+  store i8 %out1, ptr %out
+
+  ret void
+}
+
+; CHECK-LABEL: slsr_i8_zero_delta_2(
+; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
+; CHECK-NEXT:   [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
+; CHECK-NEXT:   [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 32
+; CHECK-NEXT:   [[LOAD1:%.*]] = load i8, ptr [[GEP1]]
+; CHECK-NEXT:   [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 64
+; CHECK-NEXT:   [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
+define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) {
+  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %load0 = load i8, ptr %getElem0.0
+
+  %getElem1.0 = getelementptr i8, ptr %in, i64 %add
+  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32
+  %load1 = load i8, ptr %getElem1.1
+
+  %getElem2.0 = getelementptr i8, ptr %in, i64 %add
+  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64
+  %load2 = load i8, ptr %getElem2.1
+
+  %out0 = add i8 %load0, %load1
+  %out1 = add i8 %out0, %load2
+  store i8 %out1, ptr %out
+
+  ret void
+}
+
+; CHECK-LABEL: slsr_i8_base_delta(
+; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
+; CHECK-NEXT:   [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
+; CHECK-NEXT:   [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
+; CHECK-NEXT:   [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1
+; CHECK-NEXT:   [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
+; CHECK-NEXT:   [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
+; CHECK-NEXT:   [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 2
+; CHECK-NEXT:   [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96
+; CHECK-NEXT:   [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]]
+define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) {
+  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
+  %load0 = load i8, ptr %getElem0.1
+
+  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1
+  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add
+  %getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64
+  %load1 = load i8, ptr %getElem1.2
+
+  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2
+  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add
+  %getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96
+  %load2 = load i8, ptr %getElem2.2
+
+  %out0 = add i8 %load0, %load1
+  %out1 = add i8 %out0, %load2
+  store i8 %out1, ptr %out
+
+  ret void
+}
+
+; CHECK-LABEL: slsr_i8_index_delta(
+; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
+; CHECK-NEXT:   [[GEP:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
+; CHECK-NEXT:   [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
+; CHECK-NEXT:   [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
+; CHECK-NEXT:   [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
+; CHECK-NEXT:   [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 96
+; CHECK-NEXT:   [[LOAD2:%.*]] = load i8, ptr [[GEP2]]
+define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) {
+  %getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add
+  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
+  %load0 = load i8, ptr %getElem0.1
+
+  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
+  %load1 = load i8, ptr %getElem1.1
+
+  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
+  %load2 = load i8, ptr %getElem2.1
+
+  %out0 = add i8 %load0, %load1
+  %out1 = add i8 %out0, %load2
+  store i8 %out1, ptr %out
+
+  ret void
+}
+
+; CHECK-LABEL: slsr_i8_stride_delta(
+; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]])
+; CHECK-NEXT:   [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
+; CHECK-NEXT:   [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32
+; CHECK-NEXT:   [[LOAD0:%.*]] = load i8, ptr [[GEP0]]
+; CHECK-NEXT:   [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1
+; CHECK-NEXT:   [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64
+; CHECK-NEXT:   [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]]
+; CHECK-NEXT:   [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 [[OFFSET]]
+; CHECK-NEXT:   [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96
+; CHECK-NEXT:   [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]]
+define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) {
+  %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
+  %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
+  %load0 = load i8, ptr %getElem0.1
+
+  %add1 = add i64 %add, 1
+  %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1
+  %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
+  %load1 = load i8, ptr %getElem1.1
+
+  %add2 = add i64 %add, %offset
+  %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2
+  %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
+  %load2 = load i8, ptr %getElem2.1
+
+  %out0 = add i8 %load0, %load1
+  %out1 = add i8 %out0, %load2
+  store i8 %out1, ptr %out
+
+  ret void
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
new file mode 100644
index 0000000000000..935e030c1e93f
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -passes=slsr -S | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; Test that SLSR can reuse computations through a complex variable delta.
+; The original program needs 4 mul.wide.s32; after SLSR with variable
+; deltas, only 1 mul.wide.s32 remains.
+; PTX-COUNT-1: mul.wide.s32
+; PTX-NOT: mul.wide.s32
+define void @foo(ptr %a, ptr %b, i32 %j) {
+  %i.0 = load i32, ptr %a, align 8
+  %i = add i32 %i.0, %j
+  ; CHECK: [[L:%.*]] = load i32, ptr %a, align 8
+  ; CHECK: [[I:%.*]] = add i32 [[L]], %j
+  %gep.24 = getelementptr float, ptr %b, i32 %i
+  ; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]]
+  ; CHECK: store i32 0, ptr [[GEP0]]
+  store i32 0, ptr %gep.24
+  %gep.24.sum1 = add i32 %i, %i
+  %gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1
+  ; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64
+  ; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2
+  ; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]]
+  ; CHECK: store i32 1, ptr [[GEP1]]
+  store i32 1, ptr %gep.25
+  %gep.26.sum3 = add i32 1, %i
+  %gep.27.sum = add i32 %gep.26.sum3, %i
+  %gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum
+  ; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4
+  ; CHECK: store i32 2, ptr [[GEP2]]
+  store i32 2, ptr %gep.28
+  %gep.28.sum = add i32 %gep.27.sum, %i
+  %gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum
+  ; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64
+  ; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2
+  ; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]]
+  ; CHECK: store i32 3, ptr [[GEP3]]
+  store i32 3, ptr %gep.29
+  %gep.29.sum = add i32 %gep.28.sum, %i
+  %gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum
+  ; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64
+  ; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2
+  ; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]]
+  ; CHECK: store i32 4, ptr [[GEP4]]
+  store i32 4, ptr %gep.30
+  ret void
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
new file mode 100644
index 0000000000000..4e8ae40f039aa
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -passes="slsr" -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+%struct.B = type { i16 }
+%struct.A = type { %struct.B, %struct.B, %struct.B }
+
+define void @path_compression(i32 %a, ptr %base, i16 %r, i1 %cond) {
+; CHECK-LABEL: @path_compression(
+; CHECK: [[I:%.*]] = sext i32 %a to i64
+; CHECK: [[GEP1:%.*]] = getelementptr inbounds %struct.A, ptr %base, i64 [[I]]
+; CHECK: br
+; CHECK-LABEL: next
+; Compress the path: GEP3 uses GEP1 as its basis instead of GEP2.
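+; (Without compression, GEP3 would chain on GEP2 as GEP3 = GEP2 + 2; using
+; GEP1 directly keeps both offsets as plain immediates off a single base.)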
+; CHECK: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2
+; CHECK: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 4
+
+
+  %1 = sext i32 %a to i64
+  %2 = add i64 %1, 1
+  %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
+  br i1 %cond, label %next, label %ret
+
+next:
+  %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1
+  %offset = sub i64 %2, 1
+  %getElem3 = getelementptr inbounds %struct.A, ptr %base, i64 %offset, i32 2
+  store i16 %r, ptr %getElem1, align 2
+  store i16 %r, ptr %getElem2, align 2
+  store i16 %r, ptr %getElem3, align 2
+  br label %ret
+
+ret:
+  ret void
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll
new file mode 100644
index 0000000000000..7fa0e4f9474f1
--- /dev/null
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -passes="slsr" -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+%struct.B = type { i16 }
+%struct.A = type { %struct.B, %struct.B }
+
+define i32 @pick(i32 %0, ptr %addr) {
+; `d` can be optimized in two ways:
+; 1. a = 1 + 1 * %0
+;    d = 1 + 8 * %0
+;      = a + 7 * %0
+; 2. c = (8 * %0) + 3
+;    d = (8 * %0) + 1
+;      = c - 2
+; Pick candidate (2): it avoids materializing (7 * %0) and so saves one instruction.
+;
+; CHECK-LABEL: pick
+; CHECK: [[A:%.*]] = add i32 %0, 1
+; CHECK: [[B:%.*]] = shl i32 %0, 3
+; CHECK: [[C:%.*]] = add i32 [[B]], 3
+; CHECK: store i32 [[C]], ptr %addr
+; CHECK: [[D:%.*]] = add i32 [[C]], -2
+; CHECK: ret i32 [[D]]
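+;
+; For reference (not a checked case): rewriting %d against %a instead would
+; first have to materialize the delta, e.g.
+;   %t = mul i32 %0, 7
+;   %d = add i32 %a, %t
+; which needs an extra mul, so %c is the better basis.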
+
+  %a = add i32 %0, 1
+  %b = shl i32 %0, 3
+  %c = add i32 %b, 3
+  store i32 %c, ptr %addr
+  %d = add i32 %b, 1
+  ret i32 %d
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
index d85331f77b12f..5d23735237f9f 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -4,6 +4,8 @@
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 
+; Index Delta
+
 define void @shl(i32 %b, i32 %s) {
 ; CHECK-LABEL: @shl(
 ; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]]
@@ -171,3 +173,97 @@ define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
 declare void @foo(i32)
 declare void @voo(<2 x i32>)
 declare void @bar(i128)
+
+; Stride Delta
+
+define void @stride_const(i32 %a, ptr %base, i16 %r) {
+; Reuse add1 to compute add2
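+; add2 = %baseInt + (%1 + 4) * 2 = add1 + 8, so add2 is rewritten as add1 plus 8.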
+; CHECK-LABEL: @stride_const(
+; CHECK: [[I1:%.*]] = sext i32 %a to i64
+; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2
+; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64
+; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
+; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], 8
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 2
+  %3 = add i64 %1, 4
+  %4 = mul i64 %3, 2
+  %baseInt = ptrtoint ptr %base to i64
+  %add1 = add i64 %baseInt, %2
+  %add2 = add i64 %baseInt, %4
+  %addr1 = inttoptr i64 %add1 to ptr
+  %addr2 = inttoptr i64 %add2 to ptr
+  store i16 %r, ptr %addr1, align 2
+  store i16 %r, ptr %addr2, align 2
+  ret void
+}
+
+define void @stride_var(i32 %a, ptr %base, i16 %r, i64 %n) {
+; Reuse add1 to compute add2, saving an add.s64
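+; add2 = %baseInt + (%1 + %n) * 2 = add1 + 2 * %n (the shl of %n by 1 below).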
+; CHECK-LABEL: @stride_var(
+; CHECK: [[I1:%.*]] = sext i32 %a to i64
+; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2
+; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64
+; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
+; CHECK: [[I3:%.*]] = shl i64 %n, 1
+; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], [[I3]]
+
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 2
+  %3 = add i64 %1, %n
+  %4 = mul i64 %3, 2
+  %baseInt = ptrtoint ptr %base to i64
+  %add1 = add i64 %baseInt, %2
+  %add2 = add i64 %baseInt, %4
+  %addr1 = inttoptr i64 %add1 to ptr
+  %addr2 = inttoptr i64 %add2 to ptr
+  store i16 %r, ptr %addr1, align 2
+  store i16 %r, ptr %addr2, align 2
+  ret void
+}
+
+; Base Delta
+
+define void @base_const(i32 %a, ptr %base, i16 %r) {
+; Reuse add1 to compute add2
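+; add2 = (%baseInt + 5) + %2 = add1 + 5.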
+; CHECK-LABEL: @base_const(
+; CHECK: [[I1:%.*]] = sext i32 %a to i64
+; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2
+; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64
+; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
+; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], 5
+
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 2
+  %baseInt = ptrtoint ptr %base to i64
+  %add1 = add i64 %baseInt, %2
+  %add2.0 = add i64 %baseInt, 5
+  %add2 = add i64 %add2.0, %2
+  %addr1 = inttoptr i64 %add1 to ptr
+  %addr2 = inttoptr i64 %add2 to ptr
+  store i16 %r, ptr %addr1, align 2
+  store i16 %r, ptr %addr2, align 2
+  ret void
+}
+
+define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) {
+; Reuse add1 to compute add2
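+; add2 = (%baseInt + %n) + %2 = add1 + %n.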
+; CHECK-LABEL: @base_var(
+; CHECK: [[I1:%.*]] = sext i32 %a to i64
+; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2
+; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64
+; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
+; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], %n
+
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 2
+  %baseInt = ptrtoint ptr %base to i64
+  %add1 = add i64 %baseInt, %2
+  %add2.0 = add i64 %baseInt, %n
+  %add2 = add i64 %add2.0, %2
+  %addr1 = inttoptr i64 %add1 to ptr
+  %addr2 = inttoptr i64 %add2 to ptr
+  store i16 %r, ptr %addr1, align 2
+  store i16 %r, ptr %addr2, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
index 7cd45329c24fe..5eb8d058e295f 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
@@ -3,6 +3,42 @@
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64-p:64:64:64-p1:32:32:32-p2:128:128:128:32"
 
+; Index Delta
+; Most of the original test cases in this file were optimized by Index-delta.
+; After adding Base-delta and Stride-delta, most of the GEP test cases are now
+; optimized by Stride-delta. The only case where a GEP still needs Index-delta
+; SLSR is reusing the address computation of a GEP with a different pointee
+; type (see @index_delta below). Once LLVM completely moves from typed GEP to
+; PtrAdd, Index-delta for GEP/PtrAdd can be removed.
+
+define void @index_delta(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
+; CHECK-LABEL: define void @index_delta(
+; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
+; CHECK-NEXT:    [[ADD0:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]]
+; CHECK-NEXT:    [[OFFSET:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[GETELEM:%.*]] = getelementptr i8, ptr [[INPUT]], i64 [[OFFSET]]
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM]], align 4
+; CHECK-NEXT:    [[TMP:%.*]] = mul i64 [[OFFSET]], 3
+; CHECK-NEXT:    [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP]]
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM_1]], align 4
+; CHECK-NEXT:    ret void
+
+  %add0 = add nsw i32 %b, 1
+  %mul.1 = mul nsw i32 %add0, %n
+  %add.1 = add i32 %mul.1, %c
+  %offset = sext i32 %add.1 to i64
+  %getElem = getelementptr i8, ptr %input, i64 %offset
+  store float %r, ptr %getElem, align 4
+  %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset
+  store float %r, ptr %getElem.1, align 4
+
+  ret void
+}
+
+; Stride Delta
+
 ; foo(input[0]);
 ; foo(input[s]);
 ; foo(input[s * 2]);
@@ -263,3 +299,115 @@ declare void @foo(ptr)
 declare void @bar(ptr)
 declare void @baz(ptr addrspace(1))
 declare void @baz2(ptr addrspace(2))
+
+define void @stride_const(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
+; CHECK-LABEL: define void @stride_const(
+; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B]], [[N]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[C]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[N]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[N]]
+; CHECK-NEXT:    [[OFFSET:%.*]] = sext i32 [[ADD_2]] to i64
+; CHECK-NEXT:    [[GETELEM_1:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[OFFSET]]
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM_1]], align 4
+; CHECK-NEXT:    [[GETELEM_2:%.*]] = getelementptr i8, ptr [[GETELEM_1]], i64 16
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM_2]], align 4
+; CHECK-NEXT:    ret void
+;
+
+  %mul = mul nsw i32 %b, %n
+  %add = add i32 %mul, %c
+  %add.1 = add i32 %add, %n
+  %add.2 = add i32 %add.1, %n
+
+  %offset = sext i32 %add.2 to i64
+  %1 = getelementptr float, ptr %input, i64 %offset
+  store float %r, ptr %1, align 4
+
+  %offset3 = add i64 %offset, 4
+  %2 = getelementptr float, ptr %input, i64 %offset3
+  store float %r, ptr %2, align 4
+  ret void
+}
+
+
+define void @stride_var(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
+; CHECK-LABEL: define void @stride_var(
+; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
+; CHECK-NEXT:    [[ADD0:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]]
+; CHECK-NEXT:    [[I:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[GETELEM:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I]]
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP2]]
+; CHECK-NEXT:    store float [[R]], ptr [[GETELEM_1]], align 4
+; CHECK-NEXT:    ret void
+;
+; Reuse getElem to compute getElem.1 with the variable delta %n extracted from
+; the stride (%add.2 = %add.1 + %n, i.e. 4 * sext(%n) bytes).
+
+  %add0 = add nsw i32 %b, 1
+  %mul.1 = mul nsw i32 %add0, %n
+  %add.1 = add i32 %mul.1, %c
+  %offset = sext i32 %add.1 to i64
+  %getElem = getelementptr float, ptr %input, i64 %offset
+  store float %r, ptr %getElem, align 4
+
+  %mul = mul nsw i32 %b, %n
+  %add = add i32 %mul, %c
+  %add.11 = add i32 %add, %n
+  %add.2 = add i32 %add.11, %n
+  %offset1 = sext i32 %add.2 to i64
+  %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset1
+  store float %r, ptr %getElem.1, align 4
+
+  ret void
+}
+
+; Base Delta
+
+%struct.B = type { i16 }
+%struct.A = type { %struct.B, %struct.B }
+
+define void @base_const(i32 %a, ptr %base, i16 %r) {
+; Reuse getElem1 to compute getElem2
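+; Field 1 of %struct.A is at byte offset 2, so getElem2 = getElem1 + 2.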
+; CHECK-LABEL: define void @base_const(
+; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    store i16 [[R]], ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2
+; CHECK-NEXT:    store i16 [[R]], ptr [[GEP2]], align 2
+; CHECK-NEXT:    ret void
+;
+
+  %1 = sext i32 %a to i64
+  %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
+  store i16 %r, ptr %getElem1, align 2
+  %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1
+  store i16 %r, ptr %getElem2, align 2
+  ret void
+}
+
+define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) {
+; Reuse getElem1 to compute getElem2
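+; getElem2 indexes off %base1 = %base + %n bytes, so getElem2 = getElem1 + %n.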
+; CHECK-LABEL: define void @base_var(
+; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    store i16 [[R]], ptr [[GETELEM1]], align 2
+; CHECK-NEXT:    [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1]], i64 [[N]]
+; CHECK-NEXT:    store i16 [[R]], ptr [[GETELEM2]], align 2
+; CHECK-NEXT:    ret void
+;
+
+  %1 = sext i32 %a to i64
+  %base1 = getelementptr inbounds i8, ptr %base, i64 %n
+  %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
+  store i16 %r, ptr %getElem1, align 2
+  %getElem2 = getelementptr inbounds %struct.A, ptr %base1, i64 %1
+  store i16 %r, ptr %getElem2, align 2
+  ret void
+}

>From b64d72c8336d89189ea9a68576b923173d101ac4 Mon Sep 17 00:00:00 2001
From: feip <feip at nvidia.com>
Date: Fri, 10 Oct 2025 14:52:29 -0700
Subject: [PATCH 2/5] Update AMDGPU tests

---
 .../Scalar/StraightLineStrengthReduce.cpp     |   3 +
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  20 +-
 .../AMDGPU/dagcombine-reassociate-bug.ll      |   2 +-
 llvm/test/CodeGen/AMDGPU/idot2.ll             |  12 +-
 llvm/test/CodeGen/AMDGPU/idot4s.ll            | 161 +++---
 llvm/test/CodeGen/AMDGPU/idot8u.ll            |   6 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      | 461 +++++++++---------
 .../AMDGPU/splitkit-getsubrangeformask.ll     |  18 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll     | 335 +++++--------
 .../AMDGPU/pr23975.ll                         |   2 +-
 .../reassociate-geps-and-slsr-addrspace.ll    |  10 +-
 11 files changed, 484 insertions(+), 546 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index c43195e9825e7..e6d1b168cf69d 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -52,6 +52,9 @@
 // S1: X = &B[i * S]
 // S2: Y = &B[i * S']   => X + i * (S' - S)
 //
+// Note: a Stride-delta rewrite of form Mul is usually unprofitable, and a
+// Base-delta rewrite of it is only sometimes profitable, so neither is
+// supported for form Mul.
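+// For example, rewriting Y = (B + i) * S' against X = (B + i) * S as
+// Y = X + (B + i) * (S' - S) still needs a multiply unless the delta
+// folds away, so nothing is saved.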
+//
 // This rewriting is in general a good idea. The code patterns we focus on
 // usually come from loop unrolling, so the delta is likely the same
 // across iterations and can be reused. When that happens, the optimized form
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index ebbeab94066d6..ef7a13819a799 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -541,10 +541,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v19, s2
 ; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX908-NEXT:    s_or_b32 s14, s14, 28
 ; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
@@ -610,13 +609,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    s_add_u32 s22, s20, s9
 ; GFX908-NEXT:    s_addc_u32 s23, s21, s13
-; GFX908-NEXT:    global_load_dword v21, v17, s[22:23] offset:-12 glc
+; GFX908-NEXT:    global_load_dword v21, v17, s[22:23] offset:16 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v20, v17, s[22:23] offset:-8 glc
+; GFX908-NEXT:    global_load_dword v20, v17, s[22:23] offset:20 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v17, s[22:23] offset:-4 glc
+; GFX908-NEXT:    global_load_dword v12, v17, s[22:23] offset:24 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v17, s[22:23] glc
+; GFX908-NEXT:    global_load_dword v12, v17, s[22:23] offset:28 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    ds_read_b64 v[12:13], v17
 ; GFX908-NEXT:    ds_read_b64 v[14:15], v0
@@ -710,7 +709,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
 ; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
 ; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GFX90A-NEXT:    s_or_b32 s14, s14, 28
 ; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s2, v18
@@ -771,13 +769,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    s_add_u32 s22, s20, s9
 ; GFX90A-NEXT:    s_addc_u32 s23, s21, s13
-; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
+; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:16 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
+; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:20 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] offset:-4 glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] offset:24 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] offset:28 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
 ; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index af1c64321222b..1b447571efaf2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -amdgpu-scalar-ir-passes=false < %s | FileCheck %s
 
 ; Test for a bug where DAGCombiner::ReassociateOps() was creating adds
 ; with offset in the first operand and base pointers in the second.
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b443e654350c5..d3a31f7133ab9 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2396,7 +2396,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
-; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v4, v3
 ; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2417,7 +2417,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
-; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
+; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v4, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2442,7 +2442,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
+; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v2, v1
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    ptr addrspace(1) %src2,
@@ -2553,7 +2553,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
-; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v4, v3
 ; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2574,7 +2574,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
-; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
+; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v4, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2599,7 +2599,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
+; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v2, v1
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index ab38bd21994ec..c96538b557cf6 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
-; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v6, v3
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v6, v3, 0, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v2
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_mad_u16 v6, v8, v7, v6
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT:    v_mad_u16 v6, v6, v7, v8
 ; GFX8-NEXT:    v_mad_u16 v4, v4, v5, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
@@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX9-NODL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v4, v4, v5, v6
 ; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v3, v4
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
 ; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 16
@@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX9-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v4, v4, v5, v6
 ; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v3, v4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 16
@@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v6, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v3
-; GFX10-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v3, v0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_bfe_i32 v4, v5, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v3, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v6, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v4, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v1, v2, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
@@ -3429,32 +3429,34 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.h
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.h
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v5.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v3, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.h, v2.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v0.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v4.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
@@ -3473,24 +3475,25 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
 ; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v3
-; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
-; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v0
 ; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
 ; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v4, v3, v2
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v2, v3, v2
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
 ; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v6, v5, v2
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v6, v7, v2
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v4, v3, v2
 ; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v0, v1, v0, v2
 ; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 50f0a39802270..02cca4abc87b4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -1684,7 +1684,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v6
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v4
-; GFX9-NEXT:    v_add3_u32 v1, v17, v1, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v17, v2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1735,7 +1735,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
 ; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v7, v6
 ; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v5, v4
-; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v1, v2
+; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v17, v2
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1789,7 +1789,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v6, v5
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
+; GFX10-DL-NEXT:    v_add3_u32 v0, v13, v3, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
                                                 ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 85a9aba1a0e51..22b46671e7649 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -365,107 +365,110 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xfe000000, v1
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xfe000000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    s_movk_i32 s0, 0x5000
+; GFX8-NEXT:    s_movk_i32 s0, 0x2800
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v11, 0
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7f
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0x7f
+; GFX8-NEXT:    s_movk_i32 s1, 0x800
+; GFX8-NEXT:    s_movk_i32 s2, 0x1000
+; GFX8-NEXT:    s_movk_i32 s3, 0x1800
+; GFX8-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    s_mov_b32 s0, 0
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:  .LBB1_2: ; %for.body
 ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffb000, v2
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffb800, v2
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[6:7]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffc000, v2
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffc800, v2
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffd000, v2
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xffffd800, v2
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xffffe000, v2
-; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v3, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[4:5]
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[20:21]
-; GFX8-NEXT:    s_addk_i32 s0, 0x2000
-; GFX8-NEXT:    s_cmp_gt_u32 s0, 0x3fffff
-; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v14, v10
-; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, v15, v11, vcc
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffe800, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xfffff000, v2
-; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[22:23]
-; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
-; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v16, v24
-; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v17, v25, vcc
-; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xfffff800, v2
-; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffd800, v2
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[8:9]
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0xffffe000, v2
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, -1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffe800, v2
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[8:9]
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xfffff000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xfffff800, v2
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[16:17]
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s1, v2
+; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_addk_i32 s5, 0x2000
+; GFX8-NEXT:    s_cmp_gt_u32 s5, 0x3fffff
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v14, v6
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v15, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s2, v2
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[20:21]
 ; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
-; GFX8-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s3, v2
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v12, v22
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v13, v23, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v2
+; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
+; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[20:21]
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(5)
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v18, v22
 ; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, v19, v23, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s0, v2
+; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
+; GFX8-NEXT:    v_addc_u32_e32 v19, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[18:19]
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x10000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v22
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v23, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v22
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v23, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v14, v8
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v15, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v20, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v21, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v17, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v14, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v15, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v20, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v21, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v16, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v17, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v12, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v18, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v19, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v18, v4
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v19, v5, vcc
 ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 1, v13
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 1, v11
 ; GFX8-NEXT:    s_and_b64 vcc, exec, vcc
 ; GFX8-NEXT:    s_cbranch_vccz .LBB1_1
 ; GFX8-NEXT:  ; %bb.4: ; %while.end
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s35
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v12
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v10
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[10:11]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: clmem_read:
@@ -495,79 +498,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, 0x5000, v0
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2800, v0
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7f
-; GFX900-NEXT:    s_movk_i32 s2, 0xd000
-; GFX900-NEXT:    s_movk_i32 s3, 0xe000
-; GFX900-NEXT:    s_movk_i32 s4, 0xf000
+; GFX900-NEXT:    s_movk_i32 s2, 0xf000
+; GFX900-NEXT:    s_movk_i32 s3, 0x1000
+; GFX900-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX900-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
 ; GFX900-NEXT:    v_mov_b32_e32 v3, v1
-; GFX900-NEXT:    s_mov_b32 s5, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v0
+; GFX900-NEXT:    s_mov_b32 s5, 0
 ; GFX900-NEXT:  .LBB1_2: ; %for.body
 ; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffe000, v2
 ; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
 ; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
 ; GFX900-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
-; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off
-; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, s2, v2
-; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v3, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, s3, v2
-; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
-; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off offset:-2048
 ; GFX900-NEXT:    s_addk_i32 s5, 0x2000
 ; GFX900-NEXT:    s_cmp_gt_u32 s5, 0x3fffff
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v22, vcc, v8, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e64 v24, s[0:1], v18, v22
-; GFX900-NEXT:    v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
-; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX900-NEXT:    global_load_dwordx2 v[22:23], v[14:15], off
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v2
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
-; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v24
-; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off
-; GFX900-NEXT:    v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v14, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, v15, v5, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v16
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v17, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, s2, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off offset:-2048
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v8, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v9, v15, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v10, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX900-NEXT:    v_add_co_u32_e64 v14, s[0:1], v12, v14
+; GFX900-NEXT:    v_addc_co_u32_e64 v15, s[0:1], v13, v11, s[0:1]
+; GFX900-NEXT:    v_add_co_u32_e32 v10, vcc, s3, v2
+; GFX900-NEXT:    v_add_co_u32_e64 v12, s[0:1], s4, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX900-NEXT:    v_addc_co_u32_e64 v13, vcc, 0, v3, s[0:1]
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v4, v14
+; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, v5, v15, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[12:13], off offset:-4096
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[10:11], off offset:2048
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v8, v16
+; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, v9, v17, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[12:13], off
+; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[12:13], off offset:2048
 ; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, 0x10000, v2
 ; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(5)
-; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v20
-; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(4)
-; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v16
-; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v17, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
-; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v18, v8
-; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v19, v9, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v16
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v17, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(2)
-; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, v22, v8
-; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, v23, v9, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
 ; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
 ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
 ; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
@@ -610,7 +610,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 3, v6
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
@@ -621,29 +621,30 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:  .LBB1_2: ; %for.body
 ; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0xffffb800
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0xffffe000
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, 0xffffc800
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, 0xfffff000
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v4, 0xffffd800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v18, vcc_lo, v4, 0xffffe800
-; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_clause 0x5
 ; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
 ; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx2 v[24:25], v[18:19], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[10:11], off
+; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[4:5], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[22:23], v[4:5], off
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0x1000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, 0x2000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[24:25], v[8:9], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v26, vcc_lo, 0x2800, v4
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[28:29], v[10:11], off offset:-2048
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
-; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
-; GFX10-NEXT:    global_load_dwordx2 v[26:27], v[18:19], off
-; GFX10-NEXT:    global_load_dwordx2 v[28:29], v[22:23], off
-; GFX10-NEXT:    global_load_dwordx2 v[30:31], v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[32:33], v[4:5], off
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v27, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[30:31], v[10:11], off
+; GFX10-NEXT:    global_load_dwordx2 v[32:33], v[26:27], off
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
 ; GFX10-NEXT:    s_addk_i32 s1, 0x2000
@@ -651,25 +652,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(10)
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v12, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v13, v3, s0
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v8, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v14, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v16, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v17, v3, s0
-; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v10, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v11, v3, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v18, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v19, v3, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v20, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v21, v3, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v22, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v23, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v14, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v24, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v25, v3, s0
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v26, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v27, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v8, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v28, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v29, v3, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -717,77 +720,73 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x5000, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2800, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x7f
-; GFX90A-NEXT:    s_movk_i32 s0, 0xd000
-; GFX90A-NEXT:    s_movk_i32 s1, 0xe000
 ; GFX90A-NEXT:    s_movk_i32 s2, 0xf000
+; GFX90A-NEXT:    s_movk_i32 s3, 0x1000
+; GFX90A-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2
-; GFX90A-NEXT:    s_mov_b32 s3, 0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    s_mov_b32 s5, 0
 ; GFX90A-NEXT:  .LBB1_2: ; %for.body
 ; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
-; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
-; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
-; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
-; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
-; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
-; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off
-; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, s0, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
-; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
-; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, s1, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
-; GFX90A-NEXT:    global_load_dwordx2 v[24:25], v[14:15], off offset:-4096
-; GFX90A-NEXT:    global_load_dwordx2 v[26:27], v[14:15], off offset:-2048
-; GFX90A-NEXT:    global_load_dwordx2 v[28:29], v[14:15], off
+; GFX90A-NEXT:    v_add_co_u32_e64 v18, s[0:1], s3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e64 v19, s[0:1], 0, v7, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e64 v20, s[0:1], s4, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffe000, v6
+; GFX90A-NEXT:    v_addc_co_u32_e64 v21, s[0:1], 0, v7, s[0:1]
+; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-4096
+; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off offset:-2048
+; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
+; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v7, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[24:25], v[20:21], off offset:-4096
+; GFX90A-NEXT:    global_load_dwordx2 v[26:27], v[20:21], off
+; GFX90A-NEXT:    global_load_dwordx2 v[28:29], v[8:9], off offset:-2048
+; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[8:9], off
 ; GFX90A-NEXT:    v_add_co_u32_e32 v22, vcc, s2, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
-; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
-; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off
+; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[22:23], off offset:-2048
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[18:19], off offset:2048
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[20:21], off offset:2048
 ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    s_addk_i32 s3, 0x2000
-; GFX90A-NEXT:    s_cmp_gt_u32 s3, 0x3fffff
-; GFX90A-NEXT:    s_waitcnt vmcnt(8)
+; GFX90A-NEXT:    s_addk_i32 s5, 0x2000
+; GFX90A-NEXT:    s_cmp_gt_u32 s5, 0x3fffff
+; GFX90A-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v28, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(3)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(7)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v18, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(6)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v20, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v16, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(4)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v24, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(3)
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v18, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v26, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(2)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v28, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
-; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v20, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1
@@ -822,7 +821,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2800, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
@@ -834,76 +833,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 0xffffc000
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0xffffe000, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0xffffc000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0xfffff000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[14:15], v[8:9], off offset:-4096
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0xffffd000, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v16, vcc_lo, v4, 0xffffe000
+; GFX11-NEXT:    s_clause 0x6
+; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
 ; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:-2048
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:-2048
-; GFX11-NEXT:    v_add_co_u32 v18, vcc_lo, 0xffffe000, v4
+; GFX11-NEXT:    global_load_b64 v[14:15], v[4:5], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[16:17], v[4:5], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[18:19], v[4:5], off
+; GFX11-NEXT:    global_load_b64 v[20:21], v[4:5], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v22, vcc_lo, v4, 0x2000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v23, null, 0, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v24, vcc_lo, 0x1000, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo
+; GFX11-NEXT:    global_load_b64 v[26:27], v[22:23], off offset:-4096
+; GFX11-NEXT:    v_add_co_u32 v28, vcc_lo, 0x2000, v4
+; GFX11-NEXT:    global_load_b64 v[24:25], v[24:25], off offset:2048
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v29, null, 0, v5, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[20:21], v[16:17], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_b64 v[18:19], v[18:19], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[16:17], v[16:17], off
-; GFX11-NEXT:    global_load_b64 v[22:23], v[22:23], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[24:25], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[26:27], v[4:5], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[28:29], v[4:5], off
+; GFX11-NEXT:    global_load_b64 v[22:23], v[22:23], off
+; GFX11-NEXT:    global_load_b64 v[28:29], v[28:29], off offset:2048
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
 ; GFX11-NEXT:    s_addk_i32 s1, 0x2000
 ; GFX11-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v14, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v12, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v8, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    v_add_co_u32 v2, s0, v10, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v14, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v8, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v12, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v20, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v21, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v17, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    v_add_co_u32 v2, s0, v18, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v19, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v16, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v20, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v17, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v21, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v22, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v26, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v23, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v27, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_add_co_u32 v2, s0, v24, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v25, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v26, v2
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v22, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v27, v3, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v23, v3, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v28, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..a5984799c74d4 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -147,7 +147,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.274, align 8, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -170,7 +170,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.283, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
@@ -191,24 +191,24 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.294, addrspace 4)
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.257, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
   ; CHECK-NEXT:   KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
   ; CHECK-NEXT:   KILL undef %470:sreg_64
   ; CHECK-NEXT:   KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
   ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.266, addrspace 4)
   ; CHECK-NEXT:   [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.306, align 8, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
@@ -226,13 +226,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.324, addrspace 4)
   ; CHECK-NEXT:   undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.330, addrspace 4)
   ; CHECK-NEXT:   undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.336, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f3cb5a783a08c..f6922c75ff848 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -7,36 +7,27 @@
 define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: barrier_vmcnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_dword v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vmcnt_global:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -57,22 +48,20 @@ bb:
 define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: barrier_vscnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v3, 1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vscnt_global:
@@ -81,18 +70,14 @@ define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    global_store_dword v[2:3], v1, off
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[0:1], v3, off
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off offset:-4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -115,22 +100,19 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[3:4]
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_barrier
 ; GFX8-NEXT:    flat_store_dword v[0:1], v3
@@ -142,19 +124,15 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:-8
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_store_dword v[2:3], v1, off
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[0:1], v3, off
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off offset:-4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -179,38 +157,30 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) {
 ; GFX8-LABEL: barrier_vmcnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_dword v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vmcnt_flat:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
-; GFX9-NEXT:    flat_load_dword v4, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v4
+; GFX9-NEXT:    flat_store_dword v[0:1], v2 offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -231,22 +201,20 @@ bb:
 define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
 ; GFX8-LABEL: barrier_vscnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v3, 1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vscnt_flat:
@@ -255,18 +223,16 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
 ; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v3
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -289,22 +255,19 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[3:4]
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
 ; GFX8-NEXT:    flat_store_dword v[0:1], v3
@@ -316,21 +279,18 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, -8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
+; GFX9-NEXT:    flat_load_dword v4, v[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
 ; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    flat_load_dword v3, v[2:3]
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v3
+; GFX9-NEXT:    flat_store_dword v[0:1], v4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,24 +315,20 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[3:4]
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
 ; GFX8-NEXT:    flat_store_dword v[0:1], v3
 ; GFX8-NEXT:    s_endpgm
@@ -383,22 +339,18 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, -8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
+; GFX9-NEXT:    flat_load_dword v4, v[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
 ; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    flat_load_dword v3, v[2:3]
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v3
+; GFX9-NEXT:    flat_store_dword v[0:1], v4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -423,34 +375,25 @@ bb:
 define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: load_vmcnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_dword v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_vmcnt_global:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -468,36 +411,28 @@ bb:
 define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) {
 ; GFX8-LABEL: load_vmcnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_dword v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_vmcnt_flat:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
-; GFX9-NEXT:    flat_load_dword v4, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dword v[0:1], v4
+; GFX9-NEXT:    flat_store_dword v[0:1], v2 offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
index b3cbc3d049b04..9debcf10bb88b 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
@@ -13,7 +13,7 @@ entry:
   %tmp = sext i32 undef to i64
   %arrayidx114 = getelementptr inbounds %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp
   %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1
-; CHECK: %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1
+; CHECK: %tmp1 = getelementptr i8, ptr addrspace(1) %arrayidx114, i64 4
   %tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4
   ret void
 }
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index 03edfdceab325..8662e61c95d66 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -46,9 +46,9 @@ define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_off
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP]]
 ; CHECK-NEXT:    [[V11:%.*]] = load i32, ptr addrspace(1) [[P1]], align 4
 ; CHECK-NEXT:    store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[J2:%.*]] = add i32 [[J1]], [[I]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[J2]] to i64
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP5]]
+; CHECK-NEXT:    [[OFFSET:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[OFFSET]], 2
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P1]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[V22:%.*]] = load i32, ptr addrspace(1) [[P2]], align 4
 ; CHECK-NEXT:    store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
@@ -109,8 +109,8 @@ define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(pt
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J1]]
 ; CHECK-NEXT:    [[V11:%.*]] = load i32, ptr addrspace(3) [[P1]], align 4
 ; CHECK-NEXT:    store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    [[J2:%.*]] = add i32 [[J1]], [[I]]
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J2]]
+; CHECK-NEXT:    [[J2:%.*]] = shl i32 [[I]], 2
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P1]], i32 [[J2]]
 ; CHECK-NEXT:    [[V22:%.*]] = load i32, ptr addrspace(3) [[P2]], align 4
 ; CHECK-NEXT:    store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void

>From f346abeb3cb4f304cbf059dbb9b6aa41052b7feb Mon Sep 17 00:00:00 2001
From: feip <feip at nvidia.com>
Date: Fri, 10 Oct 2025 15:25:12 -0700
Subject: [PATCH 3/5] Fix trailing return type.

---
 llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index e6d1b168cf69d..43e13c512fe06 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -747,8 +747,8 @@ void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) {
 // Y = A + 2
 // Z = A + 3
 // Return the delta info for C against the new Basis
-StraightLineStrengthReduce::compressPath(Candidate &C,
-                                         Candidate *Basis) const->DeltaInfo {
+auto StraightLineStrengthReduce::compressPath(
+    Candidate &C, Candidate *Basis) const -> DeltaInfo {
   if (!Basis || !Basis->Basis || C.CandidateKind == Candidate::Mul)
     return {};
   Candidate *Root = Basis;
@@ -849,8 +849,8 @@ void StraightLineStrengthReduce::sortCandidateInstructions() {
          "Dependency graph should not have cycles");
 }
 
-StraightLineStrengthReduce::pickRewriteCandidate(
-    Instruction *I) const->Candidate * {
+auto StraightLineStrengthReduce::pickRewriteCandidate(Instruction *I) const
+    -> Candidate * {
   // Return the candidate of instruction I that has the highest profit.
   auto It = RewriteCandidates.find(I);
   if (It == RewriteCandidates.end())
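
The trailing-return-type form is needed because DeltaInfo and Candidate are
nested inside StraightLineStrengthReduce: in an out-of-line definition the
leading return type is looked up before the class qualifier is seen. A minimal
standalone sketch (the Reducer class below is hypothetical, not the pass):

struct Reducer {
  struct DeltaInfo { int Delta; };
  DeltaInfo computeA() const;
  DeltaInfo computeB() const;
};

// Leading return type: looked up before "Reducer::", so it must be qualified.
Reducer::DeltaInfo Reducer::computeA() const { return {1}; }

// Trailing return type: looked up in the scope of Reducer, so the bare nested
// name suffices. This is the form the patch switches to.
auto Reducer::computeB() const -> DeltaInfo { return {2}; }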

>From 58134c4bdbedbd1ea0431c475bdcafbebfac3ad7 Mon Sep 17 00:00:00 2001
From: feip <feip at nvidia.com>
Date: Fri, 10 Oct 2025 22:39:48 -0700
Subject: [PATCH 4/5] Address comments

---
 .../Scalar/StraightLineStrengthReduce.cpp     | 139 ++++++++++--------
 .../NVPTX/slsr-i8-gep.ll                      |   3 +-
 .../NVPTX/slsr-var-delta.ll                   |   3 +-
 3 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 43e13c512fe06..072930fba7dd0 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -113,7 +113,7 @@ static const unsigned UnknownAddressSpace =
     std::numeric_limits<unsigned>::max();
 
 DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter",
-              "Controls whether rewriteCandidateWithBasis is executed.");
+              "Controls whether rewriteCandidate is executed.");
 
 namespace {
 
@@ -323,6 +323,24 @@ class StraightLineStrengthReduce {
     bool isHighEfficiency() const {
       return getComputationEfficiency(CandidateKind, Index, Stride, Base) >= 4;
     }
+
+    // Verify that this candidate has valid delta components relative to the
+    // basis
+    bool hasValidDelta(const Candidate &Basis) const {
+      switch (DeltaKind) {
+      case IndexDelta:
+        // Index differs, Base and Stride must match
+        return Base == Basis.Base && StrideSCEV == Basis.StrideSCEV;
+      case StrideDelta:
+        // Stride differs, Base and Index must match
+        return Base == Basis.Base && Index == Basis.Index;
+      case BaseDelta:
+        // Base differs, Stride and Index must match
+        return StrideSCEV == Basis.StrideSCEV && Index == Basis.Index;
+      default:
+        return false;
+      }
+    }
   };
 
   bool runOnFunction(Function &F);
@@ -363,7 +381,7 @@ class StraightLineStrengthReduce {
                                       Instruction *I);
 
   // Rewrites candidate C with respect to Basis.
-  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+  void rewriteCandidate(const Candidate &C);
 
   // Emit code that computes the "bump" from Basis to C.
   static Value *emitBump(const Candidate &Basis, const Candidate &C,
@@ -540,9 +558,8 @@ class StraightLineStrengthReduce {
   };
 };
 
-inline llvm::raw_ostream &
-operator<<(llvm::raw_ostream &OS,
-           const StraightLineStrengthReduce::Candidate &C) {
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const StraightLineStrengthReduce::Candidate &C) {
   OS << "Ins: " << *C.Ins << "\n  Base: " << *C.Base
      << "\n  Index: " << *C.Index << "\n  Stride: " << *C.Stride
      << "\n  StrideSCEV: " << *C.StrideSCEV;
@@ -551,10 +568,9 @@ operator<<(llvm::raw_ostream &OS,
   return OS;
 }
 
-LLVM_ATTRIBUTE_UNUSED
-inline llvm::raw_ostream &
-operator<<(llvm::raw_ostream &OS,
-           const StraightLineStrengthReduce::DeltaInfo &DI) {
+LLVM_DUMP_METHOD
+inline raw_ostream &
+operator<<(raw_ostream &OS, const StraightLineStrengthReduce::DeltaInfo &DI) {
   OS << "Cand: " << *DI.Cand << "\n";
   OS << "Delta Kind: ";
   switch (DI.DeltaKind) {
@@ -730,9 +746,14 @@ void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) {
 
   // If we did not find a constant delta, we might have found a variable delta
   if (C.Delta) {
-    LLVM_DEBUG(dbgs() << "Found delta from ";
-               if (C.DeltaKind == Candidate::BaseDelta) dbgs() << "Base: ";
-               else dbgs() << "Stride: "; dbgs() << *C.Delta << "\n");
+    LLVM_DEBUG({
+      dbgs() << "Found delta from ";
+      if (C.DeltaKind == Candidate::BaseDelta)
+        dbgs() << "Base: ";
+      else
+        dbgs() << "Stride: ";
+      dbgs() << *C.Delta << "\n";
+    });
     assert(C.DeltaKind != Candidate::InvalidDelta && C.Basis);
   }
 }
@@ -816,8 +837,7 @@ void StraightLineStrengthReduce::sortCandidateInstructions() {
   // processed before processing itself.
   DenseMap<Instruction *, int> InDegree;
   for (auto &KV : DependencyGraph) {
-    if (InDegree.find(KV.first) == InDegree.end())
-      InDegree[KV.first] = 0;
+    InDegree.try_emplace(KV.first, 0);
 
     for (auto *Child : KV.second) {
       InDegree[Child]++;
@@ -839,8 +859,8 @@ void StraightLineStrengthReduce::sortCandidateInstructions() {
     SortedCandidateInsts.push_back(I);
 
     for (auto *Next : DependencyGraph[I]) {
-      InDegree[Next]--;
-      if (InDegree[Next] == 0)
+      auto &Degree = InDegree[Next];
+      if (--Degree == 0)
         WorkList.push(Next);
     }
   }
@@ -1080,8 +1100,8 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
                                             IRBuilder<> &Builder,
                                             const DataLayout *DL) {
   auto CreateMul = [&](Value *LHS, Value *RHS) {
-    if (isa<ConstantInt>(RHS)) {
-      APInt ConstRHS = cast<ConstantInt>(RHS)->getValue();
+    if (ConstantInt *CR = dyn_cast<ConstantInt>(RHS)) {
+      const APInt &ConstRHS = CR->getValue();
       IntegerType *DeltaType =
           IntegerType::get(C.Ins->getContext(), ConstRHS.getBitWidth());
       if (ConstRHS.isPowerOf2()) {
@@ -1126,58 +1146,51 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
     Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
 
     return CreateMul(ExtendedStride, C.Delta);
-  } else {
-    assert(C.DeltaKind == Candidate::StrideDelta ||
-           C.DeltaKind == Candidate::BaseDelta);
-    assert(C.CandidateKind != Candidate::Mul);
-    // StrideDelta
-    // X = B + i * S
-    // Y = B + i * S'
-    //   = B + i * (S + Delta)
-    //   = B + i * S + i * Delta
-    //   = X + i * StrideDelta
-    // Bump = i * (S' - S)
-    //
-    // BaseDelta
-    // X = B  + i * S
-    // Y = B' + i * S
-    //   = (B + Delta) + i * S
-    //   = X + BaseDelta
-    // Bump = (B' - B).
-    Value *Bump = C.Delta;
-    if (C.DeltaKind == Candidate::StrideDelta) {
-      // If this value is consumed by a GEP, promote StrideDelta before doing
-      // StrideDelta * Index to ensure the same semantics as the original GEP.
-      if (C.CandidateKind == Candidate::GEP) {
-        auto *GEP = cast<GetElementPtrInst>(C.Ins);
-        Type *NewScalarIndexTy =
-            DL->getIndexType(GEP->getPointerOperandType()->getScalarType());
-        Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy);
-      }
-      if (!C.Index->isOne()) {
-        Value *ExtendedIndex =
-            Builder.CreateSExtOrTrunc(C.Index, Bump->getType());
-        Bump = CreateMul(Bump, ExtendedIndex);
-      }
+  }
+
+  assert(C.DeltaKind == Candidate::StrideDelta ||
+         C.DeltaKind == Candidate::BaseDelta);
+  assert(C.CandidateKind != Candidate::Mul);
+  // StrideDelta
+  // X = B + i * S
+  // Y = B + i * S'
+  //   = B + i * (S + Delta)
+  //   = B + i * S + i * Delta
+  //   = X + i * StrideDelta
+  // Bump = i * (S' - S)
+  //
+  // BaseDelta
+  // X = B  + i * S
+  // Y = B' + i * S
+  //   = (B + Delta) + i * S
+  //   = X + BaseDelta
+  // Bump = (B' - B).
+  Value *Bump = C.Delta;
+  if (C.DeltaKind == Candidate::StrideDelta) {
+    // If this value is consumed by a GEP, promote StrideDelta before doing
+    // StrideDelta * Index to ensure the same semantics as the original GEP.
+    if (C.CandidateKind == Candidate::GEP) {
+      auto *GEP = cast<GetElementPtrInst>(C.Ins);
+      Type *NewScalarIndexTy =
+          DL->getIndexType(GEP->getPointerOperandType()->getScalarType());
+      Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy);
+    }
+    if (!C.Index->isOne()) {
+      Value *ExtendedIndex =
+          Builder.CreateSExtOrTrunc(C.Index, Bump->getType());
+      Bump = CreateMul(Bump, ExtendedIndex);
     }
-    return Bump;
   }
+  return Bump;
 }
 
-void StraightLineStrengthReduce::rewriteCandidateWithBasis(
-    const Candidate &C, const Candidate &Basis) {
+void StraightLineStrengthReduce::rewriteCandidate(const Candidate &C) {
   if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter))
     return;
 
-  // If one of Base, Index, and Stride are different,
-  // other parts must be the same
+  const Candidate &Basis = *C.Basis;
   assert(C.Delta && C.CandidateKind == Basis.CandidateKind &&
-         ((C.Base == Basis.Base && C.StrideSCEV == Basis.StrideSCEV &&
-           C.DeltaKind == Candidate::IndexDelta) ||
-          (C.Base == Basis.Base && C.Index == Basis.Index &&
-           C.DeltaKind == Candidate::StrideDelta) ||
-          (C.StrideSCEV == Basis.StrideSCEV && C.Index == Basis.Index &&
-           C.DeltaKind == Candidate::BaseDelta)));
+         C.hasValidDelta(Basis));
 
   IRBuilder<> Builder(C.Ins);
   Value *Bump = emitBump(Basis, C, Builder, DL);
@@ -1258,7 +1271,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
   // always before rewriting its Basis
   for (Instruction *I : reverse(SortedCandidateInsts))
     if (Candidate *C = pickRewriteCandidate(I))
-      rewriteCandidateWithBasis(*C, *C->Basis);
+      rewriteCandidate(*C);
 
   for (auto *DeadIns : DeadInstructions)
     // A dead instruction may be another dead instruction's op,
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
index a2ae4cbe2b4cc..dc649673c525f 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
@@ -1,7 +1,6 @@
 ; RUN: opt < %s -passes=slsr -S | FileCheck %s
 
-target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-unknown-unknown"
+target triple = "nvptx64-nvidia-cuda"
 
 ; CHECK-LABEL: slsr_i8_zero_delta(
 ; CHECK-SAME:      ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]])
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
index 935e030c1e93f..b236640802e83 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
@@ -1,8 +1,7 @@
 ; RUN: opt < %s -passes=slsr -S | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
 
-target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-unknown-unknown"
+target triple = "nvptx64-nvidia-cuda"
 
 ; Test SLSR can reuse the computation by complex variable delta.
 ; The original program needs 4 mul.wide.s32, after SLSR with 
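
The in-degree bookkeeping simplified in the sortCandidateInstructions hunks
above (try_emplace for default-zero entries, decrement-and-test when releasing
successors) is a standard Kahn-style topological sort. A minimal standalone
sketch, with std::map standing in for llvm::DenseMap; the int node type and
the topoSort name are made up for illustration:

#include <map>
#include <queue>
#include <vector>

// Kahn's algorithm: seed the worklist with zero in-degree nodes, then release
// each node's successors as their remaining in-degree drops to zero.
std::vector<int> topoSort(const std::map<int, std::vector<int>> &Graph) {
  std::map<int, int> InDegree;
  for (const auto &KV : Graph) {
    InDegree.try_emplace(KV.first, 0);  // ensure an entry even with no preds
    for (int Child : KV.second)
      ++InDegree[Child];
  }

  std::queue<int> WorkList;
  for (const auto &KV : InDegree)
    if (KV.second == 0)
      WorkList.push(KV.first);

  std::vector<int> Sorted;
  while (!WorkList.empty()) {
    int N = WorkList.front();
    WorkList.pop();
    Sorted.push_back(N);
    auto It = Graph.find(N);
    if (It == Graph.end())
      continue;
    for (int Next : It->second)
      if (--InDegree[Next] == 0)        // same decrement-and-test idiom
        WorkList.push(Next);
  }
  return Sorted;
}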

>From 6be91c9eac6f7a1de6bf7880f57408771fe40082 Mon Sep 17 00:00:00 2001
From: feip <feip at nvidia.com>
Date: Fri, 10 Oct 2025 22:49:38 -0700
Subject: [PATCH 5/5] Improve comments

---
 .../Scalar/StraightLineStrengthReduce.cpp       | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 072930fba7dd0..a0b47bf2e8a29 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -34,7 +34,7 @@
 //
 // Note: (i' - i) * S is folded to the extent possible.
 //
-// For form Add and GEP, we can also rewrite a candidate in a simpler way
+// For Add and GEP forms, we can also rewrite a candidate in a simpler way
 // with respect to other dominating candidates if their B or S are different
 // but other parts are the same. For example,
 //
@@ -52,8 +52,8 @@
 // S1: X = &B[i * S]
 // S2: Y = &B[i * S']   => X + i * (S' - S)
 //
-// PS: Stride delta write on form Mul is usually non-profitable, and Base delta
-// write sometimes is profitable, so we do not support them on form Mul.
+// PS: Stride delta rewrites on the Mul form are usually unprofitable, and Base
+// delta rewrites are only sometimes profitable, so we support neither on Mul.
 //
 // This rewriting is in general a good idea. The code patterns we focus on
 // usually come from loop unrolling, so the delta is likely the same
@@ -1124,8 +1124,9 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
     // IndexDelta
     // X = B + i * S
     // Y = B + i' * S
-    //   = B + (i' - i) * S
-    //   = X + Delta * S
+    //   = B + (i + IndexDelta) * S
+    //   = B + i * S + IndexDelta * S
+    //   = X + IndexDelta * S
     // Bump = (i' - i) * S
 
     // If Delta is 0, C is fully redundant with C.Basis,
@@ -1154,15 +1155,15 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
   // StrideDelta
   // X = B + i * S
   // Y = B + i * S'
-  //   = B + i * (S + Delta)
-  //   = B + i * S + i * Delta
+  //   = B + i * (S + StrideDelta)
+  //   = B + i * S + i * StrideDelta
   //   = X + i * StrideDelta
   // Bump = i * (S' - S)
   //
   // BaseDelta
   // X = B  + i * S
   // Y = B' + i * S
-  //   = (B + Delta) + i * S
+  //   = (B + BaseDelta) + i * S
   //   = X + BaseDelta
   // Bump = (B' - B).
   Value *Bump = C.Delta;
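
The rewritten comments derive Bump algebraically for each delta kind. A tiny
integer model that checks the three identities; the helper names are invented
for this sketch and are not the pass's API:

#include <cassert>

// X = B + I * S is the basis; the candidate Y is rewritten as X + Bump.
long indexDeltaBump(long I, long Ip, long S) { return (Ip - I) * S; }  // Y = B + I' * S
long strideDeltaBump(long I, long S, long Sp) { return I * (Sp - S); } // Y = B + I * S'
long baseDeltaBump(long B, long Bp) { return Bp - B; }                 // Y = B' + I * S

int main() {
  long B = 100, Bp = 116, I = 3, Ip = 7, S = 8, Sp = 12;
  long X = B + I * S;
  assert(B + Ip * S == X + indexDeltaBump(I, Ip, S));
  assert(Bp + I * S == X + baseDeltaBump(B, Bp));
  assert(B + I * Sp == X + strideDeltaBump(I, S, Sp));
  return 0;
}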


