[llvm] Revert "Redesign Straight-Line Strength Reduction (SLSR) (#162930)" (PR #169546)
Alan Li via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 25 11:52:36 PST 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/169546
From 8bd8b5bc6153b9e674d745ea721c0f103bfcefd8 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Tue, 25 Nov 2025 18:30:56 +0000
Subject: [PATCH 1/3] Revert "Redesign Straight-Line Strength Reduction (SLSR)
(#162930)"
This reverts commit f67409c3ec7cd45c55656c8159bc42b3918f1116.
---
.../Scalar/StraightLineStrengthReduce.cpp | 1140 ++++-------------
.../AMDGPU/agpr-copy-no-free-registers.ll | 20 +-
.../AMDGPU/dagcombine-reassociate-bug.ll | 2 +-
llvm/test/CodeGen/AMDGPU/idot2.ll | 12 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 161 ++-
llvm/test/CodeGen/AMDGPU/idot8u.ll | 6 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 468 +++----
.../AMDGPU/splitkit-getsubrangeformask.ll | 50 +-
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll | 329 +++--
.../AMDGPU/pr23975.ll | 2 +-
.../reassociate-geps-and-slsr-addrspace.ll | 10 +-
.../NVPTX/slsr-i8-gep.ll | 271 ----
.../NVPTX/slsr-var-delta.ll | 70 -
.../path-compression.ll | 35 -
.../pick-candidate.ll | 32 -
.../StraightLineStrengthReduce/slsr-add.ll | 120 --
.../StraightLineStrengthReduce/slsr-gep.ll | 149 ---
17 files changed, 839 insertions(+), 2038 deletions(-)
delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index e5399bdd767e2..e94ad1999e32a 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -12,16 +12,17 @@
// effective in simplifying arithmetic statements derived from an unrolled loop.
// It can also simplify the logic of SeparateConstOffsetFromGEP.
//
-// There are many optimizations we can perform in the domain of SLSR.
-// We look for strength reduction candidates in the following forms:
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
//
-// Form Add: B + i * S
-// Form Mul: (B + i) * S
-// Form GEP: &B[i * S]
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
//
// where S is an integer variable, and i is a constant integer. If we found two
// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
-// in a simpler way with respect to S1 (index delta). For example,
+// in a simpler way with respect to S1. For example,
//
// S1: X = B + i * S
// S2: Y = B + i' * S => X + (i' - i) * S
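
For instance (a minimal sketch with hypothetical value names, where %b stands
for B and %s for S), the B + i * S form and its rewrite might look like this
in IR:

  ; S1 with i = 1 and S2 with i' = 3, sharing base %b and stride %s
  %x = add i64 %b, %s
  %t = mul i64 %s, 3
  %y = add i64 %b, %t
  ; after the rewrite, S2 is expressed against S1 with (i' - i) = 2
  %bump   = shl i64 %s, 1
  %y.slsr = add i64 %x, %bump
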
@@ -34,29 +35,8 @@
//
// Note: (i' - i) * S is folded to the extent possible.
//
-// For Add and GEP forms, we can also rewrite a candidate in a simpler way
-// with respect to other dominating candidates if their B or S are different
-// but other parts are the same. For example,
-//
-// Base Delta:
-// S1: X = B + i * S
-// S2: Y = B' + i * S => X + (B' - B)
-//
-// S1: X = &B [i * S]
-// S2: Y = &B'[i * S] => X + (B' - B)
-//
-// Stride Delta:
-// S1: X = B + i * S
-// S2: Y = B + i * S' => X + i * (S' - S)
-//
-// S1: X = &B[i * S]
-// S2: Y = &B[i * S'] => X + i * (S' - S)
-//
-// PS: Stride delta rewrite on Mul form is usually non-profitable, and Base
-// delta rewrite sometimes is profitable, so we do not support them on Mul.
-//
// This rewriting is in general a good idea. The code patterns we focus on
-// usually come from loop unrolling, so the delta is likely the same
+// usually come from loop unrolling, so (i' - i) * S is likely the same
// across iterations and can be reused. When that happens, the optimized form
// takes only one add starting from the second iteration.
//
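
For instance, where an unrolled body originally computed %b + %s, %b + 2 * %s,
and %b + 3 * %s independently, the rewritten chain needs only one add per
iteration because the bump %s is reused (a sketch with hypothetical names):

  %a1 = add i64 %b, %s
  %a2 = add i64 %a1, %s
  %a3 = add i64 %a2, %s
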
@@ -67,14 +47,19 @@
// TODO:
//
// - Floating point arithmetics when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
@@ -101,19 +86,16 @@
#include <cstdint>
#include <limits>
#include <list>
-#include <queue>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "slsr"
-
static const unsigned UnknownAddressSpace =
std::numeric_limits<unsigned>::max();
DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter",
- "Controls whether rewriteCandidate is executed.");
+ "Controls whether rewriteCandidateWithBasis is executed.");
namespace {
@@ -160,23 +142,15 @@ class StraightLineStrengthReduce {
GEP, // &B[..][i * S][..]
};
- enum DKind {
- InvalidDelta, // reserved for the default constructor
- IndexDelta, // Delta is a constant from Index
- BaseDelta, // Delta is a constant or variable from Base
- StrideDelta, // Delta is a constant or variable from Stride
- };
-
Candidate() = default;
Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
- Instruction *I, const SCEV *StrideSCEV)
- : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
- StrideSCEV(StrideSCEV) {}
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
Kind CandidateKind = Invalid;
const SCEV *Base = nullptr;
- // TODO: Swap Index and Stride's name.
+
// Note that Index and Stride of a GEP candidate do not necessarily have the
// same integer type. In that case, during rewriting, Stride will be
// sign-extended or truncated to Index's type.
@@ -203,164 +177,22 @@ class StraightLineStrengthReduce {
// Points to the immediate basis of this candidate, or nullptr if we cannot
// find any basis for this candidate.
Candidate *Basis = nullptr;
-
- DKind DeltaKind = InvalidDelta;
-
- // Store SCEV of Stride to compute delta from different strides
- const SCEV *StrideSCEV = nullptr;
-
- // Points to (Y - X) that will be used to rewrite this candidate.
- Value *Delta = nullptr;
-
- /// Cost model: Evaluate the computational efficiency of the candidate.
- ///
- /// Efficiency levels (higher is better):
- /// ZeroInst (5) - [Variable] or [Const]
- /// OneInstOneVar (4) - [Variable + Const] or [Variable * Const]
- /// OneInstTwoVar (3) - [Variable + Variable] or [Variable * Variable]
- /// TwoInstOneVar (2) - [Const + Const * Variable]
- /// TwoInstTwoVar (1) - [Variable + Const * Variable]
- enum EfficiencyLevel : unsigned {
- Unknown = 0,
- TwoInstTwoVar = 1,
- TwoInstOneVar = 2,
- OneInstTwoVar = 3,
- OneInstOneVar = 4,
- ZeroInst = 5
- };
-
- static EfficiencyLevel
- getComputationEfficiency(Kind CandidateKind, const ConstantInt *Index,
- const Value *Stride, const SCEV *Base = nullptr) {
- bool IsConstantBase = false;
- bool IsZeroBase = false;
- // When evaluating the efficiency of a rewrite, if the Base's SCEV is
- // not available, conservatively assume the base is not constant.
- if (auto *ConstBase = dyn_cast_or_null<SCEVConstant>(Base)) {
- IsConstantBase = true;
- IsZeroBase = ConstBase->getValue()->isZero();
- }
-
- bool IsConstantStride = isa<ConstantInt>(Stride);
- bool IsZeroStride =
- IsConstantStride && cast<ConstantInt>(Stride)->isZero();
- // All constants
- if (IsConstantBase && IsConstantStride)
- return ZeroInst;
-
- // (Base + Index) * Stride
- if (CandidateKind == Mul) {
- if (IsZeroStride)
- return ZeroInst;
- if (Index->isZero())
- return (IsConstantStride || IsConstantBase) ? OneInstOneVar
- : OneInstTwoVar;
-
- if (IsConstantBase)
- return IsZeroBase && (Index->isOne() || Index->isMinusOne())
- ? ZeroInst
- : OneInstOneVar;
-
- if (IsConstantStride) {
- auto *CI = cast<ConstantInt>(Stride);
- return (CI->isOne() || CI->isMinusOne()) ? OneInstOneVar
- : TwoInstOneVar;
- }
- return TwoInstTwoVar;
- }
-
- // Base + Index * Stride
- assert(CandidateKind == Add || CandidateKind == GEP);
- if (Index->isZero() || IsZeroStride)
- return ZeroInst;
-
- bool IsSimpleIndex = Index->isOne() || Index->isMinusOne();
-
- if (IsConstantBase)
- return IsZeroBase ? (IsSimpleIndex ? ZeroInst : OneInstOneVar)
- : (IsSimpleIndex ? OneInstOneVar : TwoInstOneVar);
-
- if (IsConstantStride)
- return IsZeroStride ? ZeroInst : OneInstOneVar;
-
- if (IsSimpleIndex)
- return OneInstTwoVar;
-
- return TwoInstTwoVar;
- }
-
- // Evaluate if the given delta is profitable to rewrite this candidate.
- bool isProfitableRewrite(const Value *Delta, const DKind DeltaKind) const {
- // This function cannot accurately evaluate the profit of whole expression
- // with context. A candidate (B + I * S) cannot express whether this
- // instruction needs to compute on its own (I * S), which may be shared
- // with other candidates or may need instructions to compute.
- // If the rewritten form has the same strength, still rewrite to
- // (X + Delta) since it may expose more CSE opportunities on Delta, as
- // unrolled loops usually have identical Delta for each unrolled body.
- //
- // Note, this function should only be used on Index Delta rewrite.
- // Base and Stride delta need context info to evaluate the register
- // pressure impact from variable delta.
- return getComputationEfficiency(CandidateKind, Index, Stride, Base) <=
- getRewriteEfficiency(Delta, DeltaKind);
- }
-
- // Evaluate the rewrite efficiency of this candidate with its Basis
- EfficiencyLevel getRewriteEfficiency() const {
- return Basis ? getRewriteEfficiency(Delta, DeltaKind) : Unknown;
- }
-
- // Evaluate the rewrite efficiency of this candidate with a given delta
- EfficiencyLevel getRewriteEfficiency(const Value *Delta,
- const DKind DeltaKind) const {
- switch (DeltaKind) {
- case BaseDelta: // [X + Delta]
- return getComputationEfficiency(
- CandidateKind,
- ConstantInt::get(cast<IntegerType>(Delta->getType()), 1), Delta);
- case StrideDelta: // [X + Index * Delta]
- return getComputationEfficiency(CandidateKind, Index, Delta);
- case IndexDelta: // [X + Delta * Stride]
- return getComputationEfficiency(CandidateKind, cast<ConstantInt>(Delta),
- Stride);
- default:
- return Unknown;
- }
- }
-
- bool isHighEfficiency() const {
- return getComputationEfficiency(CandidateKind, Index, Stride, Base) >=
- OneInstOneVar;
- }
-
- // Verify that this candidate has valid delta components relative to the
- // basis
- bool hasValidDelta(const Candidate &Basis) const {
- switch (DeltaKind) {
- case IndexDelta:
- // Index differs, Base and Stride must match
- return Base == Basis.Base && StrideSCEV == Basis.StrideSCEV;
- case StrideDelta:
- // Stride differs, Base and Index must match
- return Base == Basis.Base && Index == Basis.Index;
- case BaseDelta:
- // Base differs, Stride and Index must match
- return StrideSCEV == Basis.StrideSCEV && Index == Basis.Index;
- default:
- return false;
- }
- }
};
bool runOnFunction(Function &F);
private:
- // Fetch straight-line basis for rewriting C, update C.Basis to point to it,
- // and store the delta between C and its Basis in C.Delta.
- void setBasisAndDeltaFor(Candidate &C);
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
// Returns whether the candidate can be folded into an addressing mode.
- bool isFoldable(const Candidate &C, TargetTransformInfo *TTI);
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
// Checks whether I is in a candidate form. If so, adds all the matching forms
// to Candidates, and tries to find the immediate basis for each of them.
@@ -384,6 +216,12 @@ class StraightLineStrengthReduce {
// Allocate candidates and find bases for GetElementPtr instructions.
void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+
// Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
// basis.
void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
@@ -391,7 +229,13 @@ class StraightLineStrengthReduce {
Instruction *I);
// Rewrites candidate C with respect to Basis.
- void rewriteCandidate(const Candidate &C);
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+ // A helper function that factors ArrayIdx to a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
// Emit code that computes the "bump" from Basis to C.
static Value *emitBump(const Candidate &Basis, const Candidate &C,
@@ -403,203 +247,12 @@ class StraightLineStrengthReduce {
TargetTransformInfo *TTI = nullptr;
std::list<Candidate> Candidates;
- // Map from SCEV to instructions that represent the value,
- // instructions are sorted in depth-first order.
- DenseMap<const SCEV *, SmallSetVector<Instruction *, 2>> SCEVToInsts;
-
- // Record the dependency between instructions. If C.Basis == B, we would have
- // {B.Ins -> {C.Ins, ...}}.
- MapVector<Instruction *, std::vector<Instruction *>> DependencyGraph;
-
- // Map between each instruction and its possible candidates.
- DenseMap<Instruction *, SmallVector<Candidate *, 3>> RewriteCandidates;
-
- // All instructions that have candidates sort in topological order based on
- // dependency graph, from roots to leaves.
- std::vector<Instruction *> SortedCandidateInsts;
-
- // Record all instructions that are already rewritten and will be removed
- // later.
- std::vector<Instruction *> DeadInstructions;
-
- // Classify candidates against Delta kind
- class CandidateDictTy {
- public:
- using CandsTy = SmallVector<Candidate *, 8>;
- using BBToCandsTy = DenseMap<const BasicBlock *, CandsTy>;
-
- private:
- // Index delta Basis must have the same (Base, StrideSCEV, Inst.Type)
- using IndexDeltaKeyTy = std::tuple<const SCEV *, const SCEV *, Type *>;
- DenseMap<IndexDeltaKeyTy, BBToCandsTy> IndexDeltaCandidates;
-
- // Base delta Basis must have the same (StrideSCEV, Index, Inst.Type)
- using BaseDeltaKeyTy = std::tuple<const SCEV *, ConstantInt *, Type *>;
- DenseMap<BaseDeltaKeyTy, BBToCandsTy> BaseDeltaCandidates;
-
- // Stride delta Basis must have the same (Base, Index, Inst.Type)
- using StrideDeltaKeyTy = std::tuple<const SCEV *, ConstantInt *, Type *>;
- DenseMap<StrideDeltaKeyTy, BBToCandsTy> StrideDeltaCandidates;
-
- public:
- // TODO: Disable index delta on GEP after we completely move
- // from typed GEP to PtrAdd.
- const BBToCandsTy *getCandidatesWithDeltaKind(const Candidate &C,
- Candidate::DKind K) const {
- assert(K != Candidate::InvalidDelta);
- if (K == Candidate::IndexDelta) {
- IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, C.Ins->getType());
- auto It = IndexDeltaCandidates.find(IndexDeltaKey);
- if (It != IndexDeltaCandidates.end())
- return &It->second;
- } else if (K == Candidate::BaseDelta) {
- BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, C.Ins->getType());
- auto It = BaseDeltaCandidates.find(BaseDeltaKey);
- if (It != BaseDeltaCandidates.end())
- return &It->second;
- } else {
- assert(K == Candidate::StrideDelta);
- StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, C.Ins->getType());
- auto It = StrideDeltaCandidates.find(StrideDeltaKey);
- if (It != StrideDeltaCandidates.end())
- return &It->second;
- }
- return nullptr;
- }
-
- // Pointers to C must remain valid until CandidateDict is cleared.
- void add(Candidate &C) {
- Type *ValueType = C.Ins->getType();
- BasicBlock *BB = C.Ins->getParent();
- IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, ValueType);
- BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, ValueType);
- StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, ValueType);
- IndexDeltaCandidates[IndexDeltaKey][BB].push_back(&C);
- BaseDeltaCandidates[BaseDeltaKey][BB].push_back(&C);
- StrideDeltaCandidates[StrideDeltaKey][BB].push_back(&C);
- }
- // Remove all mappings from set
- void clear() {
- IndexDeltaCandidates.clear();
- BaseDeltaCandidates.clear();
- StrideDeltaCandidates.clear();
- }
- } CandidateDict;
-
- const SCEV *getAndRecordSCEV(Value *V) {
- auto *S = SE->getSCEV(V);
- if (isa<Instruction>(V) && !(isa<SCEVCouldNotCompute>(S) ||
- isa<SCEVUnknown>(S) || isa<SCEVConstant>(S)))
- SCEVToInsts[S].insert(cast<Instruction>(V));
-
- return S;
- }
-
- // Get the nearest instruction before CI that represents the value of S,
- // return nullptr if no instruction is associated with S or S is not a
- // reusable expression.
- Value *getNearestValueOfSCEV(const SCEV *S, const Instruction *CI) const {
- if (isa<SCEVCouldNotCompute>(S))
- return nullptr;
-
- if (auto *SU = dyn_cast<SCEVUnknown>(S))
- return SU->getValue();
- if (auto *SC = dyn_cast<SCEVConstant>(S))
- return SC->getValue();
-
- auto It = SCEVToInsts.find(S);
- if (It == SCEVToInsts.end())
- return nullptr;
-
- // Instructions are sorted in depth-first order, so search for the nearest
- // instruction by walking the list in reverse order.
- for (Instruction *I : reverse(It->second))
- if (DT->dominates(I, CI))
- return I;
-
- return nullptr;
- }
-
- struct DeltaInfo {
- Candidate *Cand;
- Candidate::DKind DeltaKind;
- Value *Delta;
-
- DeltaInfo()
- : Cand(nullptr), DeltaKind(Candidate::InvalidDelta), Delta(nullptr) {}
- DeltaInfo(Candidate *Cand, Candidate::DKind DeltaKind, Value *Delta)
- : Cand(Cand), DeltaKind(DeltaKind), Delta(Delta) {}
- operator bool() const { return Cand != nullptr; }
- };
-
- friend raw_ostream &operator<<(raw_ostream &OS, const DeltaInfo &DI);
-
- DeltaInfo compressPath(Candidate &C, Candidate *Basis) const;
-
- Candidate *pickRewriteCandidate(Instruction *I) const;
- void sortCandidateInstructions();
- static Constant *getIndexDelta(Candidate &C, Candidate &Basis);
- static bool isSimilar(Candidate &C, Candidate &Basis, Candidate::DKind K);
-
- // Add Basis -> C in DependencyGraph and propagate
- // C.Stride and C.Delta's dependency to C
- void addDependency(Candidate &C, Candidate *Basis) {
- if (Basis)
- DependencyGraph[Basis->Ins].emplace_back(C.Ins);
-
- // If any candidate of Inst has a basis, then Inst will be rewritten,
- // C must be rewritten after rewriting Inst, so we need to propagate
- // the dependency to C
- auto PropagateDependency = [&](Instruction *Inst) {
- if (auto CandsIt = RewriteCandidates.find(Inst);
- CandsIt != RewriteCandidates.end() &&
- llvm::any_of(CandsIt->second,
- [](Candidate *Cand) { return Cand->Basis; }))
- DependencyGraph[Inst].emplace_back(C.Ins);
- };
-
- // If C has a variable delta and the delta is a candidate,
- // propagate its dependency to C
- if (auto *DeltaInst = dyn_cast_or_null<Instruction>(C.Delta))
- PropagateDependency(DeltaInst);
-
- // If the stride is a candidate, propagate its dependency to C
- if (auto *StrideInst = dyn_cast<Instruction>(C.Stride))
- PropagateDependency(StrideInst);
- };
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // after all rewriting finishes.
+ std::vector<Instruction *> UnlinkedInstructions;
};
-inline raw_ostream &operator<<(raw_ostream &OS,
- const StraightLineStrengthReduce::Candidate &C) {
- OS << "Ins: " << *C.Ins << "\n Base: " << *C.Base
- << "\n Index: " << *C.Index << "\n Stride: " << *C.Stride
- << "\n StrideSCEV: " << *C.StrideSCEV;
- if (C.Basis)
- OS << "\n Delta: " << *C.Delta << "\n Basis: \n [ " << *C.Basis << " ]";
- return OS;
-}
-
-[[maybe_unused]] LLVM_DUMP_METHOD inline raw_ostream &
-operator<<(raw_ostream &OS, const StraightLineStrengthReduce::DeltaInfo &DI) {
- OS << "Cand: " << *DI.Cand << "\n";
- OS << "Delta Kind: ";
- switch (DI.DeltaKind) {
- case StraightLineStrengthReduce::Candidate::IndexDelta:
- OS << "Index";
- break;
- case StraightLineStrengthReduce::Candidate::BaseDelta:
- OS << "Base";
- break;
- case StraightLineStrengthReduce::Candidate::StrideDelta:
- OS << "Stride";
- break;
- default:
- break;
- }
- OS << "\nDelta: " << *DI.Delta;
- return OS;
-}
-
} // end anonymous namespace
char StraightLineStrengthReduceLegacyPass::ID = 0;
@@ -616,290 +269,17 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() {
return new StraightLineStrengthReduceLegacyPass();
}
-// A helper function that unifies the bitwidth of A and B.
-static void unifyBitWidth(APInt &A, APInt &B) {
- if (A.getBitWidth() < B.getBitWidth())
- A = A.sext(B.getBitWidth());
- else if (A.getBitWidth() > B.getBitWidth())
- B = B.sext(A.getBitWidth());
-}
-
-Constant *StraightLineStrengthReduce::getIndexDelta(Candidate &C,
- Candidate &Basis) {
- APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
- unifyBitWidth(Idx, BasisIdx);
- APInt IndexDelta = Idx - BasisIdx;
- IntegerType *DeltaType =
- IntegerType::get(C.Ins->getContext(), IndexDelta.getBitWidth());
- return ConstantInt::get(DeltaType, IndexDelta);
-}
-
-bool StraightLineStrengthReduce::isSimilar(Candidate &C, Candidate &Basis,
- Candidate::DKind K) {
- bool SameType = false;
- switch (K) {
- case Candidate::StrideDelta:
- SameType = C.StrideSCEV->getType() == Basis.StrideSCEV->getType();
- break;
- case Candidate::BaseDelta:
- SameType = C.Base->getType() == Basis.Base->getType();
- break;
- case Candidate::IndexDelta:
- SameType = true;
- break;
- default:;
- }
- return SameType && Basis.Ins != C.Ins &&
- Basis.CandidateKind == C.CandidateKind;
-}
-
-void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) {
- auto SearchFrom = [this, &C](const CandidateDictTy::BBToCandsTy &BBToCands,
- auto IsTarget) -> bool {
- // Search dominating candidates by walking the immediate-dominator chain
- // from the candidate's defining block upward. Visiting blocks in this
- // order ensures we prefer the closest dominating basis.
- const BasicBlock *BB = C.Ins->getParent();
- while (BB) {
- auto It = BBToCands.find(BB);
- if (It != BBToCands.end())
- for (Candidate *Basis : reverse(It->second))
- if (IsTarget(Basis))
- return true;
-
- const DomTreeNode *Node = DT->getNode(BB);
- if (!Node)
- break;
- Node = Node->getIDom();
- BB = Node ? Node->getBlock() : nullptr;
- }
- return false;
- };
-
- // Priority:
- // Constant Delta from Index > Constant Delta from Base >
- // Constant Delta from Stride > Variable Delta from Base or Stride
- // TODO: Change the priority to align with the cost model.
-
- // First, look for a constant index-diff basis
- if (const auto *IndexDeltaCandidates =
- CandidateDict.getCandidatesWithDeltaKind(C, Candidate::IndexDelta)) {
- bool FoundConstDelta =
- SearchFrom(*IndexDeltaCandidates, [&](Candidate *Basis) {
- if (isSimilar(C, *Basis, Candidate::IndexDelta)) {
- assert(DT->dominates(Basis->Ins, C.Ins));
- auto *Delta = getIndexDelta(C, *Basis);
- if (!C.isProfitableRewrite(Delta, Candidate::IndexDelta))
- return false;
- C.Basis = Basis;
- C.DeltaKind = Candidate::IndexDelta;
- C.Delta = Delta;
- LLVM_DEBUG(dbgs() << "Found delta from Index " << *C.Delta << "\n");
- return true;
- }
- return false;
- });
- if (FoundConstDelta)
- return;
- }
-
- // No constant-index-diff basis found. Look for the best possible base-diff
- // or stride-diff basis
- // Base/Stride diffs not supported for form (B + i) * S
- if (C.CandidateKind == Candidate::Mul)
- return;
-
- auto For = [this, &C](Candidate::DKind K) {
- // return true if we find a Basis with a constant delta and stop searching,
- // return false if we did not find a Basis or the delta is not a constant,
- // and continue searching for a Basis with a constant delta
- return [K, this, &C](Candidate *Basis) -> bool {
- if (!isSimilar(C, *Basis, K))
- return false;
-
- assert(DT->dominates(Basis->Ins, C.Ins));
- const SCEV *BasisPart =
- (K == Candidate::BaseDelta) ? Basis->Base : Basis->StrideSCEV;
- const SCEV *CandPart =
- (K == Candidate::BaseDelta) ? C.Base : C.StrideSCEV;
- const SCEV *Diff = SE->getMinusSCEV(CandPart, BasisPart);
- Value *AvailableVal = getNearestValueOfSCEV(Diff, C.Ins);
- if (!AvailableVal)
- return false;
-
- // Record delta if none has been found yet, or the new delta is
- // a constant that is better than the existing delta.
- if (!C.Delta || isa<ConstantInt>(AvailableVal)) {
- C.Delta = AvailableVal;
- C.Basis = Basis;
- C.DeltaKind = K;
- }
- return isa<ConstantInt>(C.Delta);
- };
- };
-
- if (const auto *BaseDeltaCandidates =
- CandidateDict.getCandidatesWithDeltaKind(C, Candidate::BaseDelta)) {
- if (SearchFrom(*BaseDeltaCandidates, For(Candidate::BaseDelta))) {
- LLVM_DEBUG(dbgs() << "Found delta from Base: " << *C.Delta << "\n");
- return;
- }
- }
-
- if (const auto *StrideDeltaCandidates =
- CandidateDict.getCandidatesWithDeltaKind(C, Candidate::StrideDelta)) {
- if (SearchFrom(*StrideDeltaCandidates, For(Candidate::StrideDelta))) {
- LLVM_DEBUG(dbgs() << "Found delta from Stride: " << *C.Delta << "\n");
- return;
- }
- }
-
- // If we did not find a constant delta, we might have found a variable delta
- if (C.Delta) {
- LLVM_DEBUG({
- dbgs() << "Found delta from ";
- if (C.DeltaKind == Candidate::BaseDelta)
- dbgs() << "Base: ";
- else
- dbgs() << "Stride: ";
- dbgs() << *C.Delta << "\n";
- });
- assert(C.DeltaKind != Candidate::InvalidDelta && C.Basis);
- }
-}
-
-// Compress the path from `Basis` to the deepest Basis in the Basis chain
-// to avoid non-profitable data dependency and improve ILP.
-// X = A + 1
-// Y = X + 1
-// Z = Y + 1
-// ->
-// X = A + 1
-// Y = A + 2
-// Z = A + 3
-// Return the delta info for C against the new Basis
-auto StraightLineStrengthReduce::compressPath(Candidate &C,
- Candidate *Basis) const
- -> DeltaInfo {
- if (!Basis || !Basis->Basis || C.CandidateKind == Candidate::Mul)
- return {};
- Candidate *Root = Basis;
- Value *NewDelta = nullptr;
- auto NewKind = Candidate::InvalidDelta;
-
- while (Root->Basis) {
- Candidate *NextRoot = Root->Basis;
- if (C.Base == NextRoot->Base && C.StrideSCEV == NextRoot->StrideSCEV &&
- isSimilar(C, *NextRoot, Candidate::IndexDelta)) {
- ConstantInt *CI = cast<ConstantInt>(getIndexDelta(C, *NextRoot));
- if (CI->isZero() || CI->isOne() || isa<SCEVConstant>(C.StrideSCEV)) {
- Root = NextRoot;
- NewKind = Candidate::IndexDelta;
- NewDelta = CI;
- continue;
- }
- }
-
- const SCEV *CandPart = nullptr;
- const SCEV *BasisPart = nullptr;
- auto CurrKind = Candidate::InvalidDelta;
- if (C.Base == NextRoot->Base && C.Index == NextRoot->Index) {
- CandPart = C.StrideSCEV;
- BasisPart = NextRoot->StrideSCEV;
- CurrKind = Candidate::StrideDelta;
- } else if (C.StrideSCEV == NextRoot->StrideSCEV &&
- C.Index == NextRoot->Index) {
- CandPart = C.Base;
- BasisPart = NextRoot->Base;
- CurrKind = Candidate::BaseDelta;
- } else
- break;
-
- assert(CandPart && BasisPart);
- if (!isSimilar(C, *NextRoot, CurrKind))
- break;
-
- if (auto DeltaVal =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(CandPart, BasisPart))) {
- Root = NextRoot;
- NewDelta = DeltaVal->getValue();
- NewKind = CurrKind;
- } else
- break;
- }
-
- if (Root != Basis) {
- assert(NewKind != Candidate::InvalidDelta && NewDelta);
- LLVM_DEBUG(dbgs() << "Found new Basis with " << *NewDelta
- << " from path compression.\n");
- return {Root, NewKind, NewDelta};
- }
-
- return {};
-}
-
-// Topologically sort candidate instructions based on their relationship in
-// dependency graph.
-void StraightLineStrengthReduce::sortCandidateInstructions() {
- SortedCandidateInsts.clear();
- // An instruction may have multiple candidates that get different Basis
- // instructions, and each candidate can get dependencies from Basis and
- // Stride when Stride will also be rewritten by SLSR. Hence, an instruction
- // may have multiple dependencies. Use InDegree to ensure all dependencies
- // are processed before the instruction itself.
- DenseMap<Instruction *, int> InDegree;
- for (auto &KV : DependencyGraph) {
- InDegree.try_emplace(KV.first, 0);
-
- for (auto *Child : KV.second) {
- InDegree[Child]++;
- }
- }
- std::queue<Instruction *> WorkList;
- DenseSet<Instruction *> Visited;
-
- for (auto &KV : DependencyGraph)
- if (InDegree[KV.first] == 0)
- WorkList.push(KV.first);
-
- while (!WorkList.empty()) {
- Instruction *I = WorkList.front();
- WorkList.pop();
- if (!Visited.insert(I).second)
- continue;
-
- SortedCandidateInsts.push_back(I);
-
- for (auto *Next : DependencyGraph[I]) {
- auto &Degree = InDegree[Next];
- if (--Degree == 0)
- WorkList.push(Next);
- }
- }
-
- assert(SortedCandidateInsts.size() == DependencyGraph.size() &&
- "Dependency graph should not have cycles");
-}
-
-auto StraightLineStrengthReduce::pickRewriteCandidate(Instruction *I) const
- -> Candidate * {
- // Return the candidate of instruction I that has the highest profit.
- auto It = RewriteCandidates.find(I);
- if (It == RewriteCandidates.end())
- return nullptr;
-
- Candidate *BestC = nullptr;
- auto BestEfficiency = Candidate::Unknown;
- for (Candidate *C : reverse(It->second))
- if (C->Basis) {
- auto Efficiency = C->getRewriteEfficiency();
- if (Efficiency > BestEfficiency) {
- BestEfficiency = Efficiency;
- BestC = C;
- }
- }
-
- return BestC;
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // They must have the same type too. Basis.Base == C.Base
+ // doesn't guarantee their types are the same (PR23975).
+ Basis.Ins->getType() == C.Ins->getType() &&
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base, stride, and candidate kind.
+ Basis.Base == C.Base && Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
}
static bool isGEPFoldable(GetElementPtrInst *GEP,
@@ -919,7 +299,8 @@ static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
}
bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
- TargetTransformInfo *TTI) {
+ TargetTransformInfo *TTI,
+ const DataLayout *DL) {
if (C.CandidateKind == Candidate::Add)
return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
if (C.CandidateKind == Candidate::GEP)
@@ -927,39 +308,75 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
return false;
}
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (Use &Idx : GEP->indices()) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(Idx);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
Instruction *I) {
- // Record the SCEV of S so that we may use it as a variable delta.
- // Ensure that we rewrite C with an existing IR that reproduces the delta value.
-
- Candidate C(CT, B, Idx, S, I, getAndRecordSCEV(S));
- // If we can fold I into an addressing mode, computing I is likely free or
- // takes only one instruction. So, we don't need to analyze or rewrite it.
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
//
- // Currently, this algorithm can at best optimize complex computations into
- // a `variable +/* constant` form. However, some targets have stricter
- // constraints on their addressing mode.
- // For example, a `variable + constant` can only be folded to an addressing
- // mode if the constant falls within a certain range.
- // So, we also check if the instruction is already efficient enough that
- // strength reduction would not help.
- if (!isFoldable(C, TTI) && !C.isHighEfficiency()) {
- setBasisAndDeltaFor(C);
-
- // Compress unnecessary rewrite to improve ILP
- if (auto Res = compressPath(C, C.Basis)) {
- C.Basis = Res.Cand;
- C.DeltaKind = Res.DeltaKind;
- C.Delta = Res.Delta;
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
}
}
// Regardless of whether we find a basis for C, we need to push C to the
// candidate list so that it can be the basis of other candidates.
- LLVM_DEBUG(dbgs() << "Allocated Candidate: " << C << "\n");
Candidates.push_back(C);
- RewriteCandidates[C.Ins].push_back(&Candidates.back());
- CandidateDict.add(Candidates.back());
}
void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
@@ -1058,6 +475,54 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
}
}
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) * ElementSize
+ // = B + (sext(Idx) * sext(S)) * ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *PtrIdxTy = cast<IntegerType>(DL->getIndexType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ PtrIdxTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *nsw 1.
+ allocateCandidatesAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. this would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+ allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+ // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+ APInt One(RHS->getBitWidth(), 1);
+ ConstantInt *PowerOf2 =
+ ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+ allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+ }
+}
+
void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
GetElementPtrInst *GEP) {
// TODO: handle vector GEPs
@@ -1081,14 +546,11 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
Value *ArrayIdx = GEP->getOperand(I);
uint64_t ElementSize = GTI.getSequentialElementStride(*DL);
- IntegerType *PtrIdxTy = cast<IntegerType>(DL->getIndexType(GEP->getType()));
- ConstantInt *ElementSizeIdx = ConstantInt::get(PtrIdxTy, ElementSize, true);
if (ArrayIdx->getType()->getIntegerBitWidth() <=
DL->getIndexSizeInBits(GEP->getAddressSpace())) {
// Skip factoring if ArrayIdx is wider than the index size, because
// ArrayIdx is implicitly truncated to the index size.
- allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx,
- ArrayIdx, GEP);
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
}
// When ArrayIdx is the sext of a value, we try to factor that value as
// well. Handling this case is important because array indices are
@@ -1099,159 +561,118 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
DL->getIndexSizeInBits(GEP->getAddressSpace())) {
// Skip factoring if TruncatedArrayIdx is wider than the pointer size,
// because TruncatedArrayIdx is implicitly truncated to the pointer size.
- allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx,
- TruncatedArrayIdx, GEP);
+ factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
}
IndexExprs[I - 1] = OrigIndexExpr;
}
}
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
const Candidate &C,
IRBuilder<> &Builder,
const DataLayout *DL) {
- auto CreateMul = [&](Value *LHS, Value *RHS) {
- if (ConstantInt *CR = dyn_cast<ConstantInt>(RHS)) {
- const APInt &ConstRHS = CR->getValue();
- IntegerType *DeltaType =
- IntegerType::get(C.Ins->getContext(), ConstRHS.getBitWidth());
- if (ConstRHS.isPowerOf2()) {
- ConstantInt *Exponent =
- ConstantInt::get(DeltaType, ConstRHS.logBase2());
- return Builder.CreateShl(LHS, Exponent);
- }
- if (ConstRHS.isNegatedPowerOf2()) {
- ConstantInt *Exponent =
- ConstantInt::get(DeltaType, (-ConstRHS).logBase2());
- return Builder.CreateNeg(Builder.CreateShl(LHS, Exponent));
- }
- }
-
- return Builder.CreateMul(LHS, RHS);
- };
-
- Value *Delta = C.Delta;
- // If Delta is 0, C is fully redundant with C.Basis,
- // just replace C.Ins with Basis.Ins
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Delta);
- CI && CI->getValue().isZero())
- return nullptr;
-
- if (C.DeltaKind == Candidate::IndexDelta) {
- APInt IndexDelta = cast<ConstantInt>(C.Delta)->getValue();
- // IndexDelta
- // X = B + i * S
- // Y = B + i` * S
- // = B + (i + IndexDelta) * S
- // = B + i * S + IndexDelta * S
- // = X + IndexDelta * S
- // Bump = (i' - i) * S
-
- // Common case 1: if (i' - i) is 1, Bump = S.
- if (IndexDelta == 1)
- return C.Stride;
- // Common case 2: if (i' - i) is -1, Bump = -S.
- if (IndexDelta.isAllOnes())
- return Builder.CreateNeg(C.Stride);
-
- IntegerType *DeltaType =
- IntegerType::get(Basis.Ins->getContext(), IndexDelta.getBitWidth());
- Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
-
- return CreateMul(ExtendedStride, C.Delta);
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.isAllOnes())
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
}
-
- assert(C.DeltaKind == Candidate::StrideDelta ||
- C.DeltaKind == Candidate::BaseDelta);
- assert(C.CandidateKind != Candidate::Mul);
- // StrideDelta
- // X = B + i * S
- // Y = B + i * S'
- // = B + i * (S + StrideDelta)
- // = B + i * S + i * StrideDelta
- // = X + i * StrideDelta
- // Bump = i * (S' - S)
- //
- // BaseDelta
- // X = B + i * S
- // Y = B' + i * S
- // = (B + BaseDelta) + i * S
- // = X + BaseDelta
- // Bump = (B' - B).
- Value *Bump = C.Delta;
- if (C.DeltaKind == Candidate::StrideDelta) {
- // If this value is consumed by a GEP, promote StrideDelta before doing
- // StrideDelta * Index to ensure the same semantics as the original GEP.
- if (C.CandidateKind == Candidate::GEP) {
- auto *GEP = cast<GetElementPtrInst>(C.Ins);
- Type *NewScalarIndexTy =
- DL->getIndexType(GEP->getPointerOperandType()->getScalarType());
- Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy);
- }
- if (!C.Index->isOne()) {
- Value *ExtendedIndex =
- Builder.CreateSExtOrTrunc(C.Index, Bump->getType());
- Bump = CreateMul(Bump, ExtendedIndex);
- }
+ if (IndexOffset.isNegatedPowerOf2()) {
+ // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
}
- return Bump;
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
}
-void StraightLineStrengthReduce::rewriteCandidate(const Candidate &C) {
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter))
return;
- const Candidate &Basis = *C.Basis;
- assert(C.Delta && C.CandidateKind == Basis.CandidateKind &&
- C.hasValidDelta(Basis));
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
IRBuilder<> Builder(C.Ins);
Value *Bump = emitBump(Basis, C, Builder, DL);
Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
- // If delta is 0, C is fully redundant with Basis, and Bump is nullptr,
- // just replace C.Ins with Basis.Ins
- if (!Bump)
- Reduced = Basis.Ins;
- else {
- switch (C.CandidateKind) {
- case Candidate::Add:
- case Candidate::Mul: {
- // C = Basis + Bump
- Value *NegBump;
- if (match(Bump, m_Neg(m_Value(NegBump)))) {
- // If Bump is a neg instruction, emit C = Basis - (-Bump).
- Reduced = Builder.CreateSub(Basis.Ins, NegBump);
- // We only use the negative argument of Bump, and Bump itself may be
- // trivially dead.
- RecursivelyDeleteTriviallyDeadInstructions(Bump);
- } else {
- // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
- // usually unsound, e.g.,
- //
- // X = (-2 +nsw 1) *nsw INT_MAX
- // Y = (-2 +nsw 3) *nsw INT_MAX
- // =>
- // Y = X + 2 * INT_MAX
- //
- // Neither + nor * in the resultant expression is nsw.
- Reduced = Builder.CreateAdd(Basis.Ins, Bump);
- }
- break;
- }
- case Candidate::GEP: {
- bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
- // C = (char *)Basis + Bump
- Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds);
- break;
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul: {
+ // C = Basis + Bump
+ Value *NegBump;
+ if (match(Bump, m_Neg(m_Value(NegBump)))) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced = Builder.CreateSub(Basis.Ins, NegBump);
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+ // usually unsound, e.g.,
+ //
+ // X = (-2 +nsw 1) *nsw INT_MAX
+ // Y = (-2 +nsw 3) *nsw INT_MAX
+ // =>
+ // Y = X + 2 * INT_MAX
+ //
+ // Neither + nor * in the resultant expression is nsw.
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
}
- default:
- llvm_unreachable("C.CandidateKind is invalid");
- };
- Reduced->takeName(C.Ins);
+ break;
+ }
+ case Candidate::GEP: {
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ // C = (char *)Basis + Bump
+ Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds);
+ break;
}
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
C.Ins->replaceAllUsesWith(Reduced);
- DeadInstructions.push_back(C.Ins);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
}
bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
@@ -1265,42 +686,33 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
}
bool StraightLineStrengthReduce::runOnFunction(Function &F) {
- LLVM_DEBUG(dbgs() << "SLSR on Function: " << F.getName() << "\n");
// Traverse the dominator tree in the depth-first order. This order makes sure
// all bases of a candidate are in Candidates when we process it.
for (const auto Node : depth_first(DT))
for (auto &I : *(Node->getBlock()))
allocateCandidatesAndFindBasis(&I);
- // Build the dependency graph and sort candidate instructions from dependency
- // roots to leaves
- for (auto &C : Candidates) {
- DependencyGraph.try_emplace(C.Ins);
- addDependency(C, C.Basis);
+ // Rewrite candidates in the reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) {
+ for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+ Value *Op = UnlinkedInst->getOperand(I);
+ UnlinkedInst->setOperand(I, nullptr);
+ RecursivelyDeleteTriviallyDeadInstructions(Op);
+ }
+ UnlinkedInst->deleteValue();
}
- sortCandidateInstructions();
-
- // Rewrite candidates in a topological order so that a Candidate is always
- // rewritten before its Basis
- for (Instruction *I : reverse(SortedCandidateInsts))
- if (Candidate *C = pickRewriteCandidate(I))
- rewriteCandidate(*C);
-
- for (auto *DeadIns : DeadInstructions)
- // A dead instruction may be another dead instruction's op,
- // don't delete an instruction twice
- if (DeadIns->getParent())
- RecursivelyDeleteTriviallyDeadInstructions(DeadIns);
-
- bool Ret = !DeadInstructions.empty();
- DeadInstructions.clear();
- DependencyGraph.clear();
- RewriteCandidates.clear();
- SortedCandidateInsts.clear();
- // First clear all references to candidates in the list
- CandidateDict.clear();
- // Then destroy the list
- Candidates.clear();
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
return Ret;
}
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index ef7a13819a799..ebbeab94066d6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -541,9 +541,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_lshr_b32 s2, s0, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2
; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
+; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
+; GFX908-NEXT: s_or_b32 s14, s14, 28
; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -609,13 +610,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: s_add_u32 s22, s20, s9
; GFX908-NEXT: s_addc_u32 s23, s21, s13
-; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:16 glc
+; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:20 glc
+; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:24 glc
+; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:28 glc
+; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: ds_read_b64 v[12:13], v17
; GFX908-NEXT: ds_read_b64 v[14:15], v0
@@ -709,6 +710,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
+; GFX90A-NEXT: s_or_b32 s14, s14, 28
; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
@@ -769,13 +771,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_add_u32 s22, s20, s9
; GFX90A-NEXT: s_addc_u32 s23, s21, s13
-; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:16 glc
+; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:20 glc
+; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:24 glc
+; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:28 glc
+; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index 1b447571efaf2..af1c64321222b 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-scalar-ir-passes=false < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; Test for a bug where DAGCombiner::ReassociateOps() was creating adds
; with offset in the first operand and base pointers in the second.
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index bf65657ff841c..22907ca28c47f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2396,7 +2396,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
@@ -2417,7 +2417,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
-; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2442,7 +2442,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1
+; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2553,7 +2553,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
@@ -2574,7 +2574,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
-; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2599,7 +2599,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1
+; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 1a22fa805a5a7..fd1f7b000472a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
-; GFX7-NEXT: v_mul_u32_u24_e32 v3, v6, v3
+; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mad_u16 v6, v6, v7, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6
+; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
@@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
-; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6
+; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4
+; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
-; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6
+; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT: v_mul_lo_u16 v0, v3, v0
-; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT: v_bfe_i32 v4, v5, 0, 8
+; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3
+; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v3, v4, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3429,34 +3429,32 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v3, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.h
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v4.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3475,25 +3473,24 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v3, v2
+; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v2, v3
; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v7, v2
-; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v5, v6, 0, 8
+; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2
+; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v5, v2
; GFX11-DL-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v2
; GFX11-DL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index c03802e144d5b..dfc1e3c088129 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -1684,7 +1684,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6
; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4
-; GFX9-NEXT: v_add3_u32 v1, v1, v17, v2
+; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -1735,7 +1735,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11
; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6
; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4
-; GFX9-DL-NEXT: v_add3_u32 v1, v1, v17, v2
+; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1789,7 +1789,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, v13, v3, v0
+; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 1156f2718cf1e..3259ddf559d77 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -365,110 +365,107 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0
-; GFX8-NEXT: v_and_b32_e32 v10, 0xfe000000, v1
+; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: s_movk_i32 s0, 0x2800
+; GFX8-NEXT: s_movk_i32 s0, 0x5000
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, 0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v7, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, 0x7f
-; GFX8-NEXT: s_movk_i32 s1, 0x800
-; GFX8-NEXT: s_movk_i32 s2, 0x1000
-; GFX8-NEXT: s_movk_i32 s3, 0x1800
-; GFX8-NEXT: s_movk_i32 s4, 0x2000
+; GFX8-NEXT: v_mov_b32_e32 v11, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_mov_b32 s5, 0
+; GFX8-NEXT: s_mov_b32 s0, 0
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffd800, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
-; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[8:9]
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe000, v2
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13]
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe800, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[8:9]
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff000, v2
-; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xfffff800, v2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[16:17]
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, s1, v2
-; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
-; GFX8-NEXT: s_addk_i32 s5, 0x2000
-; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff
-; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, v14, v6
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v15, v7, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, s2, v2
-; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[20:21]
-; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17]
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, s3, v2
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, v12, v22
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v13, v23, vcc
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v2
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
-; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21]
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2
+; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[20:21]
+; GFX8-NEXT: s_addk_i32 s0, 0x2000
+; GFX8-NEXT: s_cmp_gt_u32 s0, 0x3fffff
; GFX8-NEXT: s_waitcnt vmcnt(5)
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v14, v10
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, v15, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfffff000, v2
+; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[22:23]
+; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, v16, v24
+; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v17, v25, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff800, v2
+; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15]
+; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17]
+; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v18, v22
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v19, v23, vcc
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, s0, v2
-; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13]
-; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19]
+; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(7)
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v22
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v22
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v23, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v14, v8
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v15, v9, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v14, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v18, v4
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v19, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v5, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 1, v11
+; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, 1, v13
; GFX8-NEXT: s_and_b64 vcc, exec, vcc
; GFX8-NEXT: s_cbranch_vccz .LBB1_1
; GFX8-NEXT: ; %bb.4: ; %while.end
; GFX8-NEXT: v_mov_b32_e32 v1, s35
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v10
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11]
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: clmem_read:
@@ -498,76 +495,79 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2800, v0
+; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x5000, v0
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: v_mov_b32_e32 v7, 0x7f
-; GFX900-NEXT: s_movk_i32 s2, 0xf000
-; GFX900-NEXT: s_movk_i32 s3, 0x1000
-; GFX900-NEXT: s_movk_i32 s4, 0x2000
+; GFX900-NEXT: s_movk_i32 s2, 0xd000
+; GFX900-NEXT: s_movk_i32 s3, 0xe000
+; GFX900-NEXT: s_movk_i32 s4, 0xf000
; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT: ; =>This Loop Header: Depth=1
; GFX900-NEXT: ; Child Loop BB1_2 Depth 2
; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: s_mov_b32 s5, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: .LBB1_2: ; %for.body
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v2
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
-; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off offset:-2048
+; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
+; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
+; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
+; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
; GFX900-NEXT: s_addk_i32 s5, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v14, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v14, v16
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v17, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s2, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
-; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off offset:-2048
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc
-; GFX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v10, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v15, vcc
-; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], v12, v14
-; GFX900-NEXT: v_addc_co_u32_e64 v15, s[0:1], v13, v11, s[0:1]
-; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s3, v2
-; GFX900-NEXT: v_add_co_u32_e64 v12, s[0:1], s4, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX900-NEXT: v_addc_co_u32_e64 v13, vcc, 0, v3, s[0:1]
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v4, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v5, v15, vcc
-; GFX900-NEXT: global_load_dwordx2 v[4:5], v[12:13], off offset:-4096
-; GFX900-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:2048
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v8, v16
-; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v9, v17, vcc
-; GFX900-NEXT: global_load_dwordx2 v[8:9], v[12:13], off
-; GFX900-NEXT: global_load_dwordx2 v[10:11], v[12:13], off offset:2048
+; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
+; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
+; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
+; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
+; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
+; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
+; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
+; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v16
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v17, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v16
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v17, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v18, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v19, v9, vcc
; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v22, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v23, v9, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -610,7 +610,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
@@ -621,30 +621,29 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: .LBB1_2: ; %for.body
; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffe000
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffb800
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xfffff000
+; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffc800
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: s_clause 0x5
+; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v4, 0xffffd800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v4, 0xffffe800
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off
; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
-; GFX10-NEXT: global_load_dwordx2 v[18:19], v[10:11], off
-; GFX10-NEXT: global_load_dwordx2 v[20:21], v[4:5], off offset:-2048
-; GFX10-NEXT: global_load_dwordx2 v[22:23], v[4:5], off
-; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0x1000
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0x2000
-; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT: global_load_dwordx2 v[24:25], v[8:9], off offset:-2048
-; GFX10-NEXT: v_add_co_u32 v26, vcc_lo, 0x2800, v4
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[28:29], v[10:11], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
+; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
-; GFX10-NEXT: v_add_co_ci_u32_e32 v27, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[30:31], v[10:11], off
-; GFX10-NEXT: global_load_dwordx2 v[32:33], v[26:27], off
+; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off
+; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off
+; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off
+; GFX10-NEXT: global_load_dwordx2 v[30:31], v[4:5], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[32:33], v[4:5], off
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT: s_addk_i32 s1, 0x2000
@@ -652,27 +651,25 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_waitcnt vmcnt(10)
; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v18, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v19, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v11, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v20, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v21, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v22, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v23, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v24, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v25, v3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v26, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v27, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v28, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v29, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -720,76 +717,77 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_mov_b32_e32 v2, s35
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2800, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
+; GFX90A-NEXT: s_movk_i32 s0, 0xd000
+; GFX90A-NEXT: s_movk_i32 s1, 0xe000
; GFX90A-NEXT: s_movk_i32 s2, 0xf000
-; GFX90A-NEXT: s_movk_i32 s3, 0x1000
-; GFX90A-NEXT: s_movk_i32 s4, 0x2000
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2
+; GFX90A-NEXT: s_mov_b32 s3, 0
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_mov_b32 s5, 0
; GFX90A-NEXT: .LBB1_2: ; %for.body
; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: v_add_co_u32_e64 v18, s[0:1], s3, v6
-; GFX90A-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v7, s[0:1]
-; GFX90A-NEXT: v_add_co_u32_e64 v20, s[0:1], s4, v6
-; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v6
-; GFX90A-NEXT: v_addc_co_u32_e64 v21, s[0:1], 0, v7, s[0:1]
-; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off
-; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[8:9], off offset:-2048
-; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[8:9], off
+; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
+; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
+; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
+; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
+; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
+; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
+; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
+; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s1, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
+; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[14:15], off offset:-4096
+; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[14:15], off offset:-2048
+; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[14:15], off
; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s2, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[22:23], off offset:-2048
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[18:19], off offset:2048
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[20:21], off offset:2048
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048
-; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off
-; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048
+; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
+; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
+; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT: s_addk_i32 s5, 0x2000
-; GFX90A-NEXT: s_cmp_gt_u32 s5, 0x3fffff
-; GFX90A-NEXT: s_waitcnt vmcnt(8)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(7)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(6)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(3)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: s_addk_i32 s3, 0x2000
+; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
+; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(9)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(8)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(4)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(3)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -824,7 +822,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT: ; =>This Loop Header: Depth=1
@@ -836,74 +834,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0xffffe000, v4
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffc000
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xfffff000, v4
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xffffc000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo
-; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:-2048
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v4, 0x2000
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, 0x1000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo
-; GFX11-NEXT: global_load_b64 v[26:27], v[22:23], off offset:-4096
-; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, 0x2000, v4
-; GFX11-NEXT: s_clause 0x6
-; GFX11-NEXT: global_load_b64 v[24:25], v[24:25], off offset:2048
-; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off
+; GFX11-NEXT: global_load_b64 v[14:15], v[8:9], off offset:-4096
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffd000, v4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v4, 0xffffe000
; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:-2048
-; GFX11-NEXT: global_load_b64 v[14:15], v[4:5], off offset:-4096
-; GFX11-NEXT: global_load_b64 v[16:17], v[4:5], off offset:-2048
-; GFX11-NEXT: global_load_b64 v[18:19], v[4:5], off
-; GFX11-NEXT: global_load_b64 v[20:21], v[4:5], off offset:2048
-; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo
+; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:-2048
+; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffe000, v4
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off
-; GFX11-NEXT: global_load_b64 v[28:29], v[28:29], off offset:2048
+; GFX11-NEXT: global_load_b64 v[20:21], v[16:17], off offset:-4096
+; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off
+; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_load_b64 v[18:19], v[18:19], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[16:17], v[16:17], off
+; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[24:25], v[4:5], off offset:-4096
+; GFX11-NEXT: global_load_b64 v[26:27], v[4:5], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[28:29], v[4:5], off
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: s_addk_i32 s1, 0x2000
; GFX11-NEXT: s_cmp_gt_u32 s1, 0x3fffff
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0
-; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2
+; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0
-; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0
+; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_add_co_u32 v2, s0, v10, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, s0
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0
+; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0
+; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2
+; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_co_u32 v2, s0, v16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v17, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2
+; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0
-; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0
; GFX11-NEXT: v_add_co_u32 v2, s0, v24, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v25, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2
+; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v28, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index c0c1763d54cc0..d6d639c5e496b 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -146,11 +146,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.276, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
@@ -169,11 +169,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.285, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.207, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.223, addrspace 4)
@@ -190,22 +190,26 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.296, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.259, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
+ ; CHECK-NEXT: KILL undef %469:sreg_64
+ ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.268, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.308, align 8, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM22]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 %73.sub0, 65535, implicit-def dead $scc
@@ -221,17 +225,17 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.326, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.332, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.338, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f6922c75ff848..30f52773bbce5 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -7,27 +7,36 @@
define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX8-LABEL: barrier_vmcnt_global:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_barrier
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v4
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: barrier_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -48,20 +57,22 @@ bb:
define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
; GFX8-LABEL: barrier_vscnt_global:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, 1
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v3, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_barrier
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: barrier_vscnt_global:
@@ -70,14 +81,18 @@ define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v3, 1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[2:3], v0, off offset:-4
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -100,19 +115,22 @@ bb:
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
; GFX8-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
@@ -124,15 +142,19 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:-8
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[2:3], v0, off offset:-4
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -157,30 +179,38 @@ bb:
define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) {
; GFX8-LABEL: barrier_vmcnt_flat:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v4
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: barrier_vmcnt_flat:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: flat_load_dword v4, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: flat_load_dword v2, v[0:1]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4
+; GFX9-NEXT: flat_store_dword v[0:1], v4
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -201,20 +231,22 @@ bb:
define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
; GFX8-LABEL: barrier_vscnt_flat:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, 1
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v3, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: barrier_vscnt_flat:
@@ -223,16 +255,18 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: flat_store_dword v[2:3], v1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 1
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v3, 1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -255,19 +289,22 @@ bb:
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
; GFX8-LABEL: barrier_vmcnt_vscnt_flat:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
@@ -279,18 +316,21 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX9-NEXT: flat_load_dword v4, v[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: flat_store_dword v[2:3], v1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT: flat_load_dword v3, v[2:3]
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: flat_store_dword v[0:1], v4
+; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -315,19 +355,22 @@ bb:
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
@@ -339,18 +382,21 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX9-NEXT: flat_load_dword v4, v[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: flat_store_dword v[2:3], v1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT: flat_load_dword v3, v[2:3]
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: flat_store_dword v[0:1], v4
+; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -375,25 +421,34 @@ bb:
define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) {
; GFX8-LABEL: load_vmcnt_global:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v4
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: load_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -411,28 +466,36 @@ bb:
define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) {
; GFX8-LABEL: load_vmcnt_flat:
; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: flat_store_dword v[0:1], v4
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: load_vmcnt_flat:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-NEXT: flat_load_dword v4, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: flat_load_dword v2, v[0:1]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4
+; GFX9-NEXT: flat_store_dword v[0:1], v4
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
index 51b42463d81ba..0d3846d1eac16 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
@@ -13,7 +13,7 @@ entry:
%tmp = sext i32 undef to i64
%arrayidx114 = getelementptr inbounds %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp
%tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1
-; CHECK: %tmp1 = getelementptr i8, ptr addrspace(1) %arrayidx114, i64 4
+; CHECK: %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1
%tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4
ret void
}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index 8662e61c95d66..03edfdceab325 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -46,9 +46,9 @@ define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_off
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP]]
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P1]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[OFFSET]], 2
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P1]], i64 [[TMP5]]
+; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]]
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[J2]] to i64
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP5]]
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P2]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
@@ -109,8 +109,8 @@ define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(pt
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J1]]
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P1]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: [[J2:%.*]] = shl i32 [[I]], 2
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P1]], i32 [[J2]]
+; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]]
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J2]]
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P2]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
deleted file mode 100644
index b28ca07084495..0000000000000
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll
+++ /dev/null
@@ -1,271 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=slsr -S | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
-
-target triple = "nvptx64-nvidia-cuda"
-
-define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) {
-; PTX-LABEL: slsr_i8_zero_delta(
-; PTX: {
-; PTX-NEXT: .reg .b16 %rs<6>;
-; PTX-NEXT: .reg .b64 %rd<5>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_param_0];
-; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_param_2];
-; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
-; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_param_1];
-; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
-; PTX-NEXT: ld.b8 %rs2, [%rd3+64];
-; PTX-NEXT: ld.b8 %rs3, [%rd3+96];
-; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
-; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
-; PTX-NEXT: st.b8 [%rd4], %rs5;
-; PTX-NEXT: ret;
-; CHECK-LABEL: define void @slsr_i8_zero_delta(
-; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
-; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
-; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
-; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 64
-; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
-; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 96
-; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
-; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
-; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
-; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
- %load0 = load i8, ptr %getElem0.1
-
- %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
- %load1 = load i8, ptr %getElem1.1
-
- %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
- %load2 = load i8, ptr %getElem2.1
-
- %out0 = add i8 %load0, %load1
- %out1 = add i8 %out0, %load2
- store i8 %out1, ptr %out
-
- ret void
-}
-
-define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) {
-; PTX-LABEL: slsr_i8_zero_delta_2(
-; PTX: {
-; PTX-NEXT: .reg .b16 %rs<6>;
-; PTX-NEXT: .reg .b64 %rd<5>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_2_param_0];
-; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_2_param_2];
-; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
-; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_2_param_1];
-; PTX-NEXT: ld.b8 %rs1, [%rd3];
-; PTX-NEXT: ld.b8 %rs2, [%rd3+32];
-; PTX-NEXT: ld.b8 %rs3, [%rd3+64];
-; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
-; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
-; PTX-NEXT: st.b8 [%rd4], %rs5;
-; PTX-NEXT: ret;
-; CHECK-LABEL: define void @slsr_i8_zero_delta_2(
-; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
-; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_0]], align 1
-; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
-; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
-; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 64
-; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
-; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
-; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
-; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %load0 = load i8, ptr %getElem0.0
-
- %getElem1.0 = getelementptr i8, ptr %in, i64 %add
- %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32
- %load1 = load i8, ptr %getElem1.1
-
- %getElem2.0 = getelementptr i8, ptr %in, i64 %add
- %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64
- %load2 = load i8, ptr %getElem2.1
-
- %out0 = add i8 %load0, %load1
- %out1 = add i8 %out0, %load2
- store i8 %out1, ptr %out
-
- ret void
-}
-
-define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) {
-; PTX-LABEL: slsr_i8_base_delta(
-; PTX: {
-; PTX-NEXT: .reg .b16 %rs<6>;
-; PTX-NEXT: .reg .b64 %rd<5>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_base_delta_param_0];
-; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_base_delta_param_2];
-; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
-; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_base_delta_param_1];
-; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
-; PTX-NEXT: ld.b8 %rs2, [%rd3+65];
-; PTX-NEXT: ld.b8 %rs3, [%rd3+98];
-; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
-; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
-; PTX-NEXT: st.b8 [%rd4], %rs5;
-; PTX-NEXT: ret;
-; CHECK-LABEL: define void @slsr_i8_base_delta(
-; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
-; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
-; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
-; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 1
-; CHECK-NEXT: [[GETELEM1_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_1]], i64 64
-; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_2]], align 1
-; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 2
-; CHECK-NEXT: [[GETELEM2_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_1]], i64 96
-; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_2]], align 1
-; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
-; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
-; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
- %load0 = load i8, ptr %getElem0.1
-
- %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1
- %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add
- %getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64
- %load1 = load i8, ptr %getElem1.2
-
- %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2
- %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add
- %getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96
- %load2 = load i8, ptr %getElem2.2
-
- %out0 = add i8 %load0, %load1
- %out1 = add i8 %out0, %load2
- store i8 %out1, ptr %out
-
- ret void
-}
-
-define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) {
-; PTX-LABEL: slsr_i8_index_delta(
-; PTX: {
-; PTX-NEXT: .reg .b16 %rs<6>;
-; PTX-NEXT: .reg .b64 %rd<7>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_index_delta_param_0];
-; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_index_delta_param_2];
-; PTX-NEXT: shl.b64 %rd3, %rd2, 3;
-; PTX-NEXT: add.s64 %rd4, %rd1, %rd3;
-; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_index_delta_param_1];
-; PTX-NEXT: ld.b8 %rs1, [%rd4+32];
-; PTX-NEXT: add.s64 %rd6, %rd1, %rd2;
-; PTX-NEXT: ld.b8 %rs2, [%rd6+64];
-; PTX-NEXT: ld.b8 %rs3, [%rd6+96];
-; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
-; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
-; PTX-NEXT: st.b8 [%rd5], %rs5;
-; PTX-NEXT: ret;
-; CHECK-LABEL: define void @slsr_i8_index_delta(
-; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) {
-; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
-; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
-; CHECK-NEXT: [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64
-; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
-; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 96
-; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
-; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
-; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
-; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- %getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add
- %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
- %load0 = load i8, ptr %getElem0.1
-
- %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
- %load1 = load i8, ptr %getElem1.1
-
- %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
- %load2 = load i8, ptr %getElem2.1
-
- %out0 = add i8 %load0, %load1
- %out1 = add i8 %out0, %load2
- store i8 %out1, ptr %out
-
- ret void
-}
-
-define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) {
-; PTX-LABEL: slsr_i8_stride_delta(
-; PTX: {
-; PTX-NEXT: .reg .b16 %rs<6>;
-; PTX-NEXT: .reg .b64 %rd<7>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_stride_delta_param_0];
-; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_stride_delta_param_2];
-; PTX-NEXT: add.s64 %rd3, %rd1, %rd2;
-; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_stride_delta_param_1];
-; PTX-NEXT: ld.b8 %rs1, [%rd3+32];
-; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_stride_delta_param_3];
-; PTX-NEXT: ld.b8 %rs2, [%rd3+65];
-; PTX-NEXT: add.s64 %rd6, %rd3, %rd5;
-; PTX-NEXT: ld.b8 %rs3, [%rd6+96];
-; PTX-NEXT: add.s16 %rs4, %rs1, %rs2;
-; PTX-NEXT: add.s16 %rs5, %rs4, %rs3;
-; PTX-NEXT: st.b8 [%rd4], %rs5;
-; PTX-NEXT: ret;
-; CHECK-LABEL: define void @slsr_i8_stride_delta(
-; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]]) {
-; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]]
-; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32
-; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1
-; CHECK-NEXT: [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 1
-; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64
-; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1
-; CHECK-NEXT: [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 [[OFFSET]]
-; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 96
-; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1
-; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
-; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]]
-; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add
- %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32
- %load0 = load i8, ptr %getElem0.1
-
- %add1 = add i64 %add, 1
- %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1
- %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64
- %load1 = load i8, ptr %getElem1.1
-
- %add2 = add i64 %add, %offset
- %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2
- %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96
- %load2 = load i8, ptr %getElem2.1
-
- %out0 = add i8 %load0, %load1
- %out1 = add i8 %out0, %load2
- store i8 %out1, ptr %out
-
- ret void
-}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
deleted file mode 100644
index ce6f720cc4352..0000000000000
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=slsr -S | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX
-
-target triple = "nvptx64-nvidia-cuda"
-
-; Test SLSR can reuse the computation by complex variable delta.
-; The original program needs 4 mul.wide.s32, after SLSR with
-; variable-delta, it can reduce to 1 mul.wide.s32.
-define void @foo(ptr %a, ptr %b, i32 %j) {
-; PTX-LABEL: foo(
-; PTX: {
-; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<9>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b64 %rd1, [foo_param_0];
-; PTX-NEXT: ld.b32 %r1, [%rd1];
-; PTX-NEXT: ld.param.b64 %rd2, [foo_param_1];
-; PTX-NEXT: ld.param.b32 %r2, [foo_param_2];
-; PTX-NEXT: add.s32 %r3, %r1, %r2;
-; PTX-NEXT: mul.wide.s32 %rd3, %r3, 4;
-; PTX-NEXT: add.s64 %rd4, %rd2, %rd3;
-; PTX-NEXT: st.b32 [%rd4], 0;
-; PTX-NEXT: add.s64 %rd5, %rd4, %rd3;
-; PTX-NEXT: st.b32 [%rd5], 1;
-; PTX-NEXT: add.s64 %rd6, %rd5, 4;
-; PTX-NEXT: st.b32 [%rd5+4], 2;
-; PTX-NEXT: add.s64 %rd7, %rd6, %rd3;
-; PTX-NEXT: st.b32 [%rd7], 3;
-; PTX-NEXT: add.s64 %rd8, %rd7, %rd3;
-; PTX-NEXT: st.b32 [%rd8], 4;
-; PTX-NEXT: ret;
- %i.0 = load i32, ptr %a, align 8
- %i = add i32 %i.0, %j
- ; CHECK: [[L:%.*]] = load i32, ptr %a, align 8
- ; CHECK: [[I:%.*]] = add i32 [[L]], %j
- %gep.24 = getelementptr float, ptr %b, i32 %i
- ; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]]
- ; CHECK: store i32 0, ptr [[GEP0]]
- store i32 0, ptr %gep.24
- %gep.24.sum1 = add i32 %i, %i
- %gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1
- ; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64
- ; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2
- ; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]]
- ; CHECK: store i32 1, ptr [[GEP1]]
- store i32 1, ptr %gep.25
- %gep.26.sum3 = add i32 1, %i
- %gep.27.sum = add i32 %gep.26.sum3, %i
- %gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum
- ; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4
- ; CHECK: store i32 2, ptr [[GEP2]]
- store i32 2, ptr %gep.28
- %gep.28.sum = add i32 %gep.27.sum, %i
- %gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum
- ; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64
- ; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2
- ; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]]
- ; CHECK: store i32 3, ptr [[GEP3]]
- store i32 3, ptr %gep.29
- %gep.29.sum = add i32 %gep.28.sum, %i
- %gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum
- ; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64
- ; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2
- ; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]]
- ; CHECK: store i32 4, ptr [[GEP4]]
- store i32 4, ptr %gep.30
- ret void
-}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
deleted file mode 100644
index 4e8ae40f039aa..0000000000000
--- a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: opt < %s -passes="slsr" -S | FileCheck %s
-
-target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-
-%struct.B = type { i16 }
-%struct.A = type { %struct.B, %struct.B, %struct.B }
-
-define void @path_compression(i32 %a, ptr %base, i16 %r, i1 %cond) {
-; CHECK-LABEL: @path_compression(
-; CHECK: [[I:%.*]] = sext i32 %a to i64
-; CHECK: [[GEP1:%.*]] = getelementptr inbounds %struct.A, ptr %base, i64 [[I]]
-; CHECK: br
-; CHECK-LABEL: next
-; compress the path to use GEP1 as the Basis instead of GEP2
-; CHECK: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2
-; CHECK: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 4
-
-
- %1 = sext i32 %a to i64
- %2 = add i64 %1, 1
- %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
- br i1 %cond, label %next, label %ret
-
-next:
- %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1
- %offset = sub i64 %2, 1
- %getElem3 = getelementptr inbounds %struct.A, ptr %base, i64 %offset, i32 2
- store i16 %r, ptr %getElem1, align 2
- store i16 %r, ptr %getElem2, align 2
- store i16 %r, ptr %getElem3, align 2
- br label %ret
-
-ret:
- ret void
-}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll
deleted file mode 100644
index 7fa0e4f9474f1..0000000000000
--- a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt < %s -passes="slsr" -S | FileCheck %s
-
-target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-
-%struct.B = type { i16 }
-%struct.A = type { %struct.B, %struct.B }
-
-define i32 @pick(i32 %0, ptr %addr) {
-; `d` can be optimized by 2 approaches
-; 1. a = 1 + 1 * %0
-; d = 1 + 8 * %0
-; = a + 7 * %0
-; 2. c = (8 * %0) + 3
-; d = (8 * %0) + 1
-; = c - 2
-; Pick candidate (2) as it can save 1 instruction from (7 * %0)
-;
-; CHECK-LABEL: pick
-; CHECK: [[A:%.*]] = add i32 %0, 1
-; CHECK: [[B:%.*]] = shl i32 %0, 3
-; CHECK: [[C:%.*]] = add i32 [[B]], 3
-; CHECK: store i32 [[C]], ptr %addr
-; CHECK: [[D:%.*]] = add i32 [[C]], -2
-; CHECK: ret i32 %d
-
- %a = add i32 %0, 1
- %b = shl i32 %0, 3
- %c = add i32 %b, 3
- store i32 %c, ptr %addr
- %d = add i32 %b, 1
- ret i32 %d
-}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
index 1abf2fa331821..d85331f77b12f 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -4,8 +4,6 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-; Index Delta
-
define void @shl(i32 %b, i32 %s) {
; CHECK-LABEL: @shl(
; CHECK-NEXT: [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]]
@@ -173,121 +171,3 @@ define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
declare void @foo(i32)
declare void @voo(<2 x i32>)
declare void @bar(i128)
-
-; Stride Delta
-
-define void @stride_const(i32 %a, ptr %base, i16 %r) {
-; Reuse add1 to compute add2
-; CHECK-LABEL: @stride_const(
-; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64
-; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2
-; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], 8
-; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr
-; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr
-; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2
-; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2
-; CHECK-NEXT: ret void
-;
- %1 = sext i32 %a to i64
- %2 = mul i64 %1, 2
- %3 = add i64 %1, 4
- %4 = mul i64 %3, 2
- %baseInt = ptrtoint ptr %base to i64
- %add1 = add i64 %baseInt, %2
- %add2 = add i64 %baseInt, %4
- %addr1 = inttoptr i64 %add1 to ptr
- %addr2 = inttoptr i64 %add2 to ptr
- store i16 %r, ptr %addr1, align 2
- store i16 %r, ptr %addr2, align 2
- ret void
-}
-
-define void @stride_var(i32 %a, ptr %base, i16 %r, i64 %n) {
-; Reuse add1 to compute add2 to save a add.s64
-; CHECK-LABEL: @stride_var(
-; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64
-; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2
-; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N:%.*]], 1
-; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], [[TMP3]]
-; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr
-; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr
-; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2
-; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2
-; CHECK-NEXT: ret void
-;
-
- %1 = sext i32 %a to i64
- %2 = mul i64 %1, 2
- %3 = add i64 %1, %n
- %4 = mul i64 %3, 2
- %baseInt = ptrtoint ptr %base to i64
- %add1 = add i64 %baseInt, %2
- %add2 = add i64 %baseInt, %4
- %addr1 = inttoptr i64 %add1 to ptr
- %addr2 = inttoptr i64 %add2 to ptr
- store i16 %r, ptr %addr1, align 2
- store i16 %r, ptr %addr2, align 2
- ret void
-}
-
-; Base Delta
-
-define void @base_const(i32 %a, ptr %base, i16 %r) {
-; Reuse add1 to compute add2
-; CHECK-LABEL: @base_const(
-; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64
-; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2
-; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], 5
-; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr
-; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr
-; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2
-; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2
-; CHECK-NEXT: ret void
-;
-
- %1 = sext i32 %a to i64
- %2 = mul i64 %1, 2
- %baseInt = ptrtoint ptr %base to i64
- %add1 = add i64 %baseInt, %2
- %add2.0 = add i64 %baseInt, 5
- %add2 = add i64 %add2.0, %2
- %addr1 = inttoptr i64 %add1 to ptr
- %addr2 = inttoptr i64 %add2 to ptr
- store i16 %r, ptr %addr1, align 2
- store i16 %r, ptr %addr2, align 2
- ret void
-}
-
-define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) {
-; Reuse add1 to compute add2
-; CHECK-LABEL: @base_var(
-; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64
-; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2
-; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64
-; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], [[N:%.*]]
-; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr
-; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr
-; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2
-; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2
-; CHECK-NEXT: ret void
-;
-
- %1 = sext i32 %a to i64
- %2 = mul i64 %1, 2
- %baseInt = ptrtoint ptr %base to i64
- %add1 = add i64 %baseInt, %2
- %add2.0 = add i64 %baseInt, %n
- %add2 = add i64 %add2.0, %2
- %addr1 = inttoptr i64 %add1 to ptr
- %addr2 = inttoptr i64 %add2 to ptr
- store i16 %r, ptr %addr1, align 2
- store i16 %r, ptr %addr2, align 2
- ret void
-}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
index ee99c6bbeb50a..7cd45329c24fe 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll
@@ -3,43 +3,6 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64-p:64:64:64-p1:32:32:32-p2:128:128:128:32"
-; Index Delta
-; Most of the original test cases in this file were optimized by Index-delta.
-; After adding Base-delta and Stride-delta, most of the GEP test cases
-; are optimized by Stride-delta now. The only case that GEP needs index-delta
-; SLSR is to reuse address computation from a GEP with different pointee type.
-; Once LLVM completely moves from typed GEP to PtrAdd, we can remove
-; index-delta for GEP/PtrAdd.
-
-define void @index_delta(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
-; CHECK-LABEL: define void @index_delta(
-; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
-; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1
-; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]]
-; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]]
-; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD1]] to i64
-; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[INPUT]], i64 [[OFFSET]]
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4
-; CHECK-NEXT: [[TMP:%.*]] = mul i64 [[OFFSET]], 3
-; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP]]
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4
-; CHECK-NEXT: ret void
-;
-
- %add0 = add nsw i32 %b, 1
- %mul.1 = mul nsw i32 %add0, %n
- %add.1 = add i32 %mul.1, %c
- %offset = sext i32 %add.1 to i64
- %getElem = getelementptr i8, ptr %input, i64 %offset
- store float %r, ptr %getElem, align 4
- %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset
- store float %r, ptr %getElem.1, align 4
-
- ret void
-}
-
-; Stride Delta
-
; foo(input[0]);
; foo(input[s]);
; foo(input[s * 2]);
@@ -300,115 +263,3 @@ declare void @foo(ptr)
declare void @bar(ptr)
declare void @baz(ptr addrspace(1))
declare void @baz2(ptr addrspace(2))
-
-define void @stride_const(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
-; CHECK-LABEL: define void @stride_const(
-; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B]], [[N]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[C]]
-; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[N]]
-; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[N]]
-; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD_2]] to i64
-; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[OFFSET]]
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4
-; CHECK-NEXT: [[GETELEM_2:%.*]] = getelementptr i8, ptr [[GETELEM_1]], i64 16
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM_2]], align 4
-; CHECK-NEXT: ret void
-;
-
- %mul = mul nsw i32 %b, %n
- %add = add i32 %mul, %c
- %add.1 = add i32 %add, %n
- %add.2 = add i32 %add.1, %n
-
- %offset = sext i32 %add.2 to i64
- %1 = getelementptr float, ptr %input, i64 %offset
- store float %r, ptr %1, align 4
-
- %offset3 = add i64 %offset, 4
- %2 = getelementptr float, ptr %input, i64 %offset3
- store float %r, ptr %2, align 4
- ret void
-}
-
-
-define void @stride_var(ptr %input, i32 %c, i32 %b, i32 %n, float %r) {
-; CHECK-LABEL: define void @stride_var(
-; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) {
-; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1
-; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]]
-; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]]
-; CHECK-NEXT: [[I:%.*]] = sext i32 [[ADD1]] to i64
-; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I]]
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2
-; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP2]]
-; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4
-; CHECK-NEXT: ret void
-;
-; Reuse getElem to compute getElem.1 and getElem.2 with variable offset n extracted from Stride
-
- %add0 = add nsw i32 %b, 1
- %mul.1 = mul nsw i32 %add0, %n
- %add.1 = add i32 %mul.1, %c
- %offset = sext i32 %add.1 to i64
- %getElem = getelementptr float, ptr %input, i64 %offset
- store float %r, ptr %getElem, align 4
-
- %mul = mul nsw i32 %b, %n
- %add = add i32 %mul, %c
- %add.11 = add i32 %add, %n
- %add.2 = add i32 %add.11, %n
- %offset1 = sext i32 %add.2 to i64
- %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset1
- store float %r, ptr %getElem.1, align 4
-
- ret void
-}
-
-; Base Delta
-
-%struct.B = type { i16 }
-%struct.A = type { %struct.B, %struct.B }
-
-define void @base_const(i32 %a, ptr %base, i16 %r) {
-; Reuse getElem1 to compute getElem2
-; CHECK-LABEL: define void @base_const(
-; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: store i16 [[R]], ptr [[GEP1]], align 2
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2
-; CHECK-NEXT: store i16 [[R]], ptr [[GEP2]], align 2
-; CHECK-NEXT: ret void
-;
-
- %1 = sext i32 %a to i64
- %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
- store i16 %r, ptr %getElem1, align 2
- %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1
- store i16 %r, ptr %getElem2, align 2
- ret void
-}
-
-define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) {
-; Reuse getElem1 to compute getElem2
-; CHECK-LABEL: define void @base_var(
-; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM1]], align 2
-; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1]], i64 [[N]]
-; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM2]], align 2
-; CHECK-NEXT: ret void
-;
-
- %1 = sext i32 %a to i64
- %base1 = getelementptr inbounds i8, ptr %base, i64 %n
- %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1
- store i16 %r, ptr %getElem1, align 2
- %getElem2 = getelementptr inbounds %struct.A, ptr %base1, i64 %1
- store i16 %r, ptr %getElem2, align 2
- ret void
-}
>From 6968e4908a3409abf161ba135667953124347a7a Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Tue, 25 Nov 2025 19:26:42 +0000
Subject: [PATCH 2/3] Fix lit tests.
---
.../AMDGPU/promote-constOffset-to-imm.ll | 1 +
.../AMDGPU/splitkit-getsubrangeformask.ll | 50 +++++++++----------
2 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3259ddf559d77..3329c9a761900 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -720,6 +720,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT: s_movk_i32 s0, 0xd000
; GFX90A-NEXT: s_movk_i32 s1, 0xe000
; GFX90A-NEXT: s_movk_i32 s2, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index d6d639c5e496b..67dae136afb72 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -146,11 +146,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.275, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
@@ -169,11 +169,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.284, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.207, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.223, addrspace 4)
@@ -190,26 +190,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.295, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %469:sreg_64
- ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.258, addrspace 4)
; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.267, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.307, align 8, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM22]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 %73.sub0, 65535, implicit-def dead $scc
@@ -225,17 +221,17 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.325, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.331, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.337, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
>From 11d93a80b3f496935f828f6e33b86da3977ff292 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Tue, 25 Nov 2025 19:52:09 +0000
Subject: [PATCH 3/3] regen checks.
---
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index 30f52773bbce5..a0aee6c80703f 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX9-LABEL: barrier_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()