[llvm] 80e6aff - [PowerPC] common chains to reuse offsets to reduce register pressure.
Chen Zheng via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 24 21:22:35 PDT 2021
Author: Chen Zheng
Date: 2021-10-25T03:27:16Z
New Revision: 80e6aff6bbad9a5959f8491b616438cc8792f32a
URL: https://github.com/llvm/llvm-project/commit/80e6aff6bbad9a5959f8491b616438cc8792f32a
DIFF: https://github.com/llvm/llvm-project/commit/80e6aff6bbad9a5959f8491b616438cc8792f32a.diff
LOG: [PowerPC] common chains to reuse offsets to reduce register pressure.
Add a new preparation pattern in PPCLoopInstFormPrep pass to reduce register
pressure.
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D108750
Added:
Modified:
llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
llvm/test/CodeGen/PowerPC/common-chain.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index fa4865ceb384a..7aba60eb37637 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -39,6 +39,40 @@
// T *p = array[-1];
// for (int i = 0; i < n; ++i)
// *++p = c;
+//
+// 3: common multiple chains for the load/stores with same offsets in the loop,
+// so that we can reuse the offsets and reduce the register pressure in the
+// loop. This transformation can also increase the loop ILP as now each chain
+// uses its own loop induction add/addi. But this will increase the number of
+// add/addi in the loop.
+//
+// Generically, this means transforming loops like this:
+//
+// char *p;
+// A1 = p + base1
+// A2 = p + base1 + offset
+// B1 = p + base2
+// B2 = p + base2 + offset
+//
+// for (int i = 0; i < n; i++)
+// unsigned long x1 = *(unsigned long *)(A1 + i);
+// unsigned long x2 = *(unsigned long *)(A2 + i)
+// unsigned long x3 = *(unsigned long *)(B1 + i);
+// unsigned long x4 = *(unsigned long *)(B2 + i);
+// }
+//
+// to look like this:
+//
+// A1_new = p + base1 // chain 1
+// B1_new = p + base2 // chain 2, now inside the loop, common offset is
+// // reused.
+//
+// for (long long i = 0; i < n; i+=count) {
+// unsigned long x1 = *(unsigned long *)(A1_new + i);
+// unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
+// unsigned long x3 = *(unsigned long *)(B1_new + i);
+// unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
+// }
//===----------------------------------------------------------------------===//
#include "PPC.h"
@@ -90,6 +124,10 @@ static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
cl::init(true), cl::Hidden,
cl::desc("prefer update form when ds form is also a update form"));
+static cl::opt<bool> EnableChainCommoning(
+ "ppc-formprep-chain-commoning", cl::init(true), cl::Hidden,
+ cl::desc("Enable chain commoning in PPC loop prepare pass."));
+
// Sum of following 3 per loop thresholds for all loops can not be larger
// than MaxVarsPrep.
// now the thresholds for each kind prep are exterimental values on Power9.
@@ -106,6 +144,16 @@ static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
cl::Hidden, cl::init(8),
cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
+// Commoning chain will reduce the register pressure, so we don't consider about
+// the PHI nodes number.
+// But commoning chain will increase the addi/add number in the loop and also
+// increase loop ILP. Maximum chain number should be same with hardware
+// IssueWidth, because we won't benefit from ILP if the parallel chains number
+// is bigger than IssueWidth. We assume there are 2 chains in one bucket, so
+// there would be 4 buckets at most on P9(IssueWidth is 8).
+static cl::opt<unsigned> MaxVarsChainCommon(
+ "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
+ cl::desc("Bucket number per loop for PPC loop chain common"));
// If would not be profitable if the common base has only one load/store, ISEL
// should already be able to choose best load/store form based on offset for
@@ -116,12 +164,18 @@ static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
"preparation"));
+static cl::opt<unsigned> ChainCommonPrepMinThreshold(
+ "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Minimal common base load/store instructions triggering chain "
+ "commoning preparation. Must be not smaller than 4"));
+
STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
+STATISTIC(ChainCommoningRewritten, "Num of commoning chains");
namespace {
struct BucketElement {
@@ -133,11 +187,24 @@ namespace {
};
struct Bucket {
- Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
- Elements(1, BucketElement(I)) {}
+ Bucket(const SCEV *B, Instruction *I)
+ : BaseSCEV(B), Elements(1, BucketElement(I)) {
+ ChainSize = 0;
+ }
+ // The base of the whole bucket.
const SCEV *BaseSCEV;
+
+ // All elements in the bucket. In the bucket, the element with the BaseSCEV
+ // has no offset and all other elements are stored as offsets to the
+ // BaseSCEV.
SmallVector<BucketElement, 16> Elements;
+
+ // The potential chains size. This is used for chain commoning only.
+ unsigned ChainSize;
+
+ // The base for each potential chain. This is used for chain commoning only.
+ SmallVector<BucketElement, 16> ChainBases;
};
// "UpdateForm" is not a real PPC instruction form, it stands for dform
@@ -193,17 +260,31 @@ namespace {
Value *getNodeForInc(Loop *L, Instruction *MemI,
const SCEV *BasePtrIncSCEV);
+ /// Common chains to reuse offsets for a loop to reduce register pressure.
+ bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets);
+
+ /// Find out the potential commoning chains and their bases.
+ bool prepareBasesForCommoningChains(Bucket &BucketChain);
+
+ /// Rewrite load/store according to the common chains.
+ bool
+ rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
+ SmallSet<BasicBlock *, 16> &BBChanged);
+
/// Collect condition matched(\p isValidCandidate() returns true)
/// candidates in Loop \p L.
SmallVector<Bucket, 16> collectCandidates(
Loop *L,
- std::function<bool(const Instruction *, const Value *, const Type *)>
+ std::function<bool(const Instruction *, Value *, const Type *)>
isValidCandidate,
+ std::function<bool(const SCEV *)> isValidDiff,
unsigned MaxCandidateNum);
- /// Add a candidate to candidates \p Buckets.
+ /// Add a candidate to candidates \p Buckets if diff between candidate and
+ /// one base in \p Buckets matches \p isValidDiff.
void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
SmallVector<Bucket, 16> &Buckets,
+ std::function<bool(const SCEV *)> isValidDiff,
unsigned MaxCandidateNum);
/// Prepare all candidates in \p Buckets for update form.
@@ -335,6 +416,221 @@ bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
return MadeChange;
}
+// Finding the minimal(chain_number + reusable_offset_number) is a complicated
+// algorithmic problem.
+// For now, the algorithm used here is simply adjusted to handle the case for
+// manually unrolling cases.
+// FIXME: use a more powerful algorithm to find minimal sum of chain_number and
+// reusable_offset_number for one base with multiple offsets.
+bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {
+ // The minimal size for profitable chain commoning:
+ // A1 = base + offset1
+ // A2 = base + offset2 (offset2 - offset1 = X)
+ // A3 = base + offset3
+ // A4 = base + offset4 (offset4 - offset3 = X)
+ // ======>
+ // base1 = base + offset1
+ // base2 = base + offset3
+ // A1 = base1
+ // A2 = base1 + X
+ // A3 = base2
+ // A4 = base2 + X
+ //
+ // There is benefit because of reuse of offest 'X'.
+
+ assert(ChainCommonPrepMinThreshold >= 4 &&
+ "Thredhold can not be smaller than 4!\n");
+ if (CBucket.Elements.size() < ChainCommonPrepMinThreshold)
+ return false;
+
+ // We simply select the FirstOffset as the first reusable offset between each
+ // chain element 1 and element 0.
+ const SCEV *FirstOffset = CBucket.Elements[1].Offset;
+
+ // Figure out how many times above FirstOffset is used in the chain.
+ // For a success commoning chain candidate, offset difference between each
+ // chain element 1 and element 0 must be also FirstOffset.
+ unsigned FirstOffsetReusedCount = 1;
+
+ // Figure out how many times above FirstOffset is used in the first chain.
+ // Chain number is FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain
+ unsigned FirstOffsetReusedCountInFirstChain = 1;
+
+ unsigned EleNum = CBucket.Elements.size();
+ bool SawChainSeparater = false;
+ for (unsigned j = 2; j != EleNum; ++j) {
+ if (SE->getMinusSCEV(CBucket.Elements[j].Offset,
+ CBucket.Elements[j - 1].Offset) == FirstOffset) {
+ if (!SawChainSeparater)
+ FirstOffsetReusedCountInFirstChain++;
+ FirstOffsetReusedCount++;
+ } else
+ // For now, if we meet any offset which is not FirstOffset, we assume we
+ // find a new Chain.
+ // This makes us miss some opportunities.
+ // For example, we can common:
+ //
+ // {OffsetA, Offset A, OffsetB, OffsetA, OffsetA, OffsetB}
+ //
+ // as two chains:
+ // {{OffsetA, Offset A, OffsetB}, {OffsetA, OffsetA, OffsetB}}
+ // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2
+ //
+ // But we fail to common:
+ //
+ // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA}
+ // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1
+
+ SawChainSeparater = true;
+ }
+
+ // FirstOffset is not reused, skip this bucket.
+ if (FirstOffsetReusedCount == 1)
+ return false;
+
+ unsigned ChainNum =
+ FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
+
+ // All elements are increased by FirstOffset.
+ // The number of chains should be sqrt(EleNum).
+ if (!SawChainSeparater)
+ ChainNum = (unsigned)sqrt(EleNum);
+
+ CBucket.ChainSize = (unsigned)(EleNum / ChainNum);
+
+ // If this is not a perfect chain(eg: not all elements can be put inside
+ // commoning chains.), skip now.
+ if (CBucket.ChainSize * ChainNum != EleNum)
+ return false;
+
+ if (SawChainSeparater) {
+ // Check that the offset seqs are the same for all chains.
+ for (unsigned i = 1; i < CBucket.ChainSize; i++)
+ for (unsigned j = 1; j < ChainNum; j++)
+ if (CBucket.Elements[i].Offset !=
+ SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset,
+ CBucket.Elements[j * CBucket.ChainSize].Offset))
+ return false;
+ }
+
+ for (unsigned i = 0; i < ChainNum; i++)
+ CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]);
+
+ LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n");
+
+ return true;
+}
+
+bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
+ SmallVector<Bucket, 16> &Buckets) {
+ bool MadeChange = false;
+
+ if (Buckets.empty())
+ return MadeChange;
+
+ SmallSet<BasicBlock *, 16> BBChanged;
+
+ for (auto &Bucket : Buckets) {
+ if (prepareBasesForCommoningChains(Bucket))
+ MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged);
+ }
+
+ if (MadeChange)
+ for (auto *BB : BBChanged)
+ DeleteDeadPHIs(BB);
+ return MadeChange;
+}
+
+bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
+ Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
+ bool MadeChange = false;
+
+ assert(Bucket.Elements.size() ==
+ Bucket.ChainBases.size() * Bucket.ChainSize &&
+ "invalid bucket for chain commoning!\n");
+ SmallPtrSet<Value *, 16> DeletedPtrs;
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+ Type *I64Ty = Type::getInt64Ty(Header->getContext());
+
+ SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
+ "loopprepare-chaincommon");
+
+ for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) {
+ unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx;
+ const SCEV *BaseSCEV =
+ ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV,
+ Bucket.Elements[BaseElemIdx].Offset)
+ : Bucket.BaseSCEV;
+ const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
+
+ // Make sure the base is able to expand.
+ if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
+ return MadeChange;
+
+ assert(BasePtrSCEV->isAffine() &&
+ "Invalid SCEV type for the base ptr for a candidate chain!\n");
+
+ std::pair<Instruction *, Instruction *> Base =
+ rewriteForBase(L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr,
+ false /* CanPreInc */, UpdateForm, SCEVE, DeletedPtrs);
+
+ if (!Base.first || !Base.second)
+ return MadeChange;
+
+ // Keep track of the replacement pointer values we've inserted so that we
+ // don't generate more pointer values than necessary.
+ SmallPtrSet<Value *, 16> NewPtrs;
+ NewPtrs.insert(Base.first);
+
+ for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize;
+ ++Idx) {
+ BucketElement &I = Bucket.Elements[Idx];
+ Value *Ptr = getPointerOperandAndType(I.Instr);
+ assert(Ptr && "No pointer operand");
+ if (NewPtrs.count(Ptr))
+ continue;
+
+ const SCEV *OffsetSCEV =
+ BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset,
+ Bucket.Elements[BaseElemIdx].Offset)
+ : Bucket.Elements[Idx].Offset;
+
+ // Make sure offset is able to expand. Only need to check one time as the
+ // offsets are reused between different chains.
+ if (!BaseElemIdx)
+ if (!isSafeToExpand(OffsetSCEV, *SE))
+ return false;
+
+ Value *OffsetValue = SCEVE.expandCodeFor(
+ OffsetSCEV, I64Ty, LoopPredecessor->getTerminator());
+
+ Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx],
+ OffsetValue, DeletedPtrs);
+
+ assert(NewPtr && "Wrong rewrite!\n");
+ NewPtrs.insert(NewPtr);
+ }
+
+ ++ChainCommoningRewritten;
+ }
+
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted below, causing the AssertingVH in the cache to trigger.
+ SCEVE.clear();
+
+ for (auto *Ptr : DeletedPtrs) {
+ if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+ BBChanged.insert(IDel->getParent());
+ RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+ }
+
+ MadeChange = true;
+ return MadeChange;
+}
+
// Rewrite the new base according to BasePtrSCEV.
// bb.loop.preheader:
// %newstart = ...
@@ -522,35 +818,43 @@ Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
return ReplNewPtr;
}
-void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
- SmallVector<Bucket, 16> &Buckets,
- unsigned MaxCandidateNum) {
+void PPCLoopInstrFormPrep::addOneCandidate(
+ Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets,
+ std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
assert((MemI && getPointerOperandAndType(MemI)) &&
"Candidate should be a memory instruction.");
assert(LSCEV && "Invalid SCEV for Ptr value.");
+
bool FoundBucket = false;
for (auto &B : Buckets) {
+ if (cast<SCEVAddRecExpr>(B.BaseSCEV)->getStepRecurrence(*SE) !=
+ cast<SCEVAddRecExpr>(LSCEV)->getStepRecurrence(*SE))
+ continue;
const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
- if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
- B.Elements.push_back(BucketElement(CDiff, MemI));
+ if (isValidDiff(Diff)) {
+ B.Elements.push_back(BucketElement(Diff, MemI));
FoundBucket = true;
break;
}
}
if (!FoundBucket) {
- if (Buckets.size() == MaxCandidateNum)
+ if (Buckets.size() == MaxCandidateNum) {
+ LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit "
+ << MaxCandidateNum << "\n");
return;
+ }
Buckets.push_back(Bucket(LSCEV, MemI));
}
}
SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
Loop *L,
- std::function<bool(const Instruction *, const Value *, const Type *)>
+ std::function<bool(const Instruction *, Value *, const Type *)>
isValidCandidate,
- unsigned MaxCandidateNum) {
+ std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
SmallVector<Bucket, 16> Buckets;
+
for (const auto &BB : L->blocks())
for (auto &J : *BB) {
Value *PtrValue = nullptr;
@@ -575,7 +879,7 @@ SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
HasCandidateForPrepare = true;
if (isValidCandidate(&J, PtrValue, PointerElementType))
- addOneCandidate(&J, LSCEV, Buckets, MaxCandidateNum);
+ addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum);
}
return Buckets;
}
@@ -712,7 +1016,8 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(
SmallPtrSet<Value *, 16> DeletedPtrs;
BasicBlock *Header = L->getHeader();
- SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
+ SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
+ "loopprepare-formrewrite");
// For some DS form load/store instructions, it can also be an update form,
// if the stride is constant and is a multipler of 4. Use update form if
@@ -990,7 +1295,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
}
// Check if a load/store has update form. This lambda is used by function
// collectCandidates which can collect candidates for types defined by lambda.
- auto isUpdateFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// There are no update forms for Altivec vector load/stores.
@@ -1022,7 +1327,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
};
// Check if a load/store has DS form.
- auto isDSFormCandidate = [](const Instruction *I, const Value *PtrValue,
+ auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
if (isa<IntrinsicInst>(I))
@@ -1036,7 +1341,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
};
// Check if a load/store has DQ form.
- auto isDQFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// Check if it is a P10 lxvp/stxvp intrinsic.
@@ -1048,37 +1353,131 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
};
+ // Check if a load/store is candidate for chain commoning.
+ // If the SCEV is only with one ptr operand in its start, we can use that
+ // start as a chain separator. Mark this load/store as a candidate.
+ auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue,
+ const Type *PointerElementType) {
+ const SCEVAddRecExpr *ARSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L));
+ if (!ARSCEV)
+ return false;
+
+ if (!ARSCEV->isAffine())
+ return false;
+
+ const SCEV *Start = ARSCEV->getStart();
+
+ // A single pointer. We can treat it as offset 0.
+ if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy())
+ return true;
+
+ const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start);
+
+ // We need a SCEVAddExpr to include both base and offset.
+ if (!ASCEV)
+ return false;
+
+ // Make sure there is only one pointer operand(base) and all other operands
+ // are integer type.
+ bool SawPointer = false;
+ for (const SCEV *Op : ASCEV->operands()) {
+ if (Op->getType()->isPointerTy()) {
+ if (SawPointer)
+ return false;
+ SawPointer = true;
+ } else if (!Op->getType()->isIntegerTy())
+ return false;
+ }
+
+ return SawPointer;
+ };
+
+ // Check if the diff is a constant type. This is used for update/DS/DQ form
+ // preparation.
+ auto isValidConstantDiff = [](const SCEV *Diff) {
+ return dyn_cast<SCEVConstant>(Diff) != nullptr;
+ };
+
+ // Make sure the diff between the base and new candidate is required type.
+ // This is used for chain commoning preparation.
+ auto isValidChainCommoningDiff = [](const SCEV *Diff) {
+ assert(Diff && "Invalid Diff!\n");
+
+ // Don't mess up previous dform prepare.
+ if (isa<SCEVConstant>(Diff))
+ return false;
+
+ // A single integer type offset.
+ if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy())
+ return true;
+
+ const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff);
+ if (!ADiff)
+ return false;
+
+ for (const SCEV *Op : ADiff->operands())
+ if (!Op->getType()->isIntegerTy())
+ return false;
+
+ return true;
+ };
+
HasCandidateForPrepare = false;
+ LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n");
// Collect buckets of comparable addresses used by loads and stores for update
// form.
- SmallVector<Bucket, 16> UpdateFormBuckets =
- collectCandidates(L, isUpdateFormCandidate, MaxVarsUpdateForm);
+ SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates(
+ L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm);
// Prepare for update form.
if (!UpdateFormBuckets.empty())
MadeChange |= updateFormPrep(L, UpdateFormBuckets);
- else if (!HasCandidateForPrepare)
+ else if (!HasCandidateForPrepare) {
+ LLVM_DEBUG(
+ dbgs()
+ << "No prepare candidates found, stop praparation for current loop!\n");
// If no candidate for preparing, return early.
return MadeChange;
+ }
+ LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n");
// Collect buckets of comparable addresses used by loads and stores for DS
// form.
- SmallVector<Bucket, 16> DSFormBuckets =
- collectCandidates(L, isDSFormCandidate, MaxVarsDSForm);
+ SmallVector<Bucket, 16> DSFormBuckets = collectCandidates(
+ L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm);
// Prepare for DS form.
if (!DSFormBuckets.empty())
MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm);
+ LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n");
// Collect buckets of comparable addresses used by loads and stores for DQ
// form.
- SmallVector<Bucket, 16> DQFormBuckets =
- collectCandidates(L, isDQFormCandidate, MaxVarsDQForm);
+ SmallVector<Bucket, 16> DQFormBuckets = collectCandidates(
+ L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm);
// Prepare for DQ form.
if (!DQFormBuckets.empty())
MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm);
+ // Collect buckets of comparable addresses used by loads and stores for chain
+ // commoning. With chain commoning, we reuse offsets between the chains, so
+ // the register pressure will be reduced.
+ if (!EnableChainCommoning) {
+ LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n");
+ return MadeChange;
+ }
+
+ LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n");
+ SmallVector<Bucket, 16> Buckets =
+ collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff,
+ MaxVarsChainCommon);
+
+ // Prepare for chain commoning.
+ if (!Buckets.empty())
+ MadeChange |= chainCommoning(L, Buckets);
+
return MadeChange;
}
diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index 5743e7f162d35..231ea4306e683 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -38,23 +38,26 @@ define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n)
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: sldi r8, r4, 1
+; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
+; CHECK-NEXT: add r8, r4, r7
+; CHECK-NEXT: add r7, r5, r4
+; CHECK-NEXT: add r5, r5, r8
+; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
-; CHECK-NEXT: sldi r7, r4, 2
-; CHECK-NEXT: add r9, r4, r8
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: ldx r6, r5, r4
-; CHECK-NEXT: ldx r10, r5, r8
-; CHECK-NEXT: ldx r11, r5, r9
-; CHECK-NEXT: ldx r12, r5, r7
+; CHECK-NEXT: ld r6, 0(r7)
+; CHECK-NEXT: ldx r8, r7, r4
+; CHECK-NEXT: ld r9, 0(r5)
+; CHECK-NEXT: ldx r10, r5, r4
+; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
-; CHECK-NEXT: mulld r6, r10, r6
-; CHECK-NEXT: mulld r6, r6, r11
-; CHECK-NEXT: maddld r3, r6, r12, r3
+; CHECK-NEXT: mulld r6, r8, r6
+; CHECK-NEXT: mulld r6, r6, r9
+; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
@@ -108,8 +111,8 @@ for.body: ; preds = %entry, %for.body
; 4: + offset
; 5: + offset
;
-; It can not be commoned to chains because we will need a chain for a single address,
-; which can not make the commoning be profitable.
+; It can not be commoned to chains because we need a chain for a single address.
+; It is not profitable to common chains if not all addresses are in chains.
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
@@ -304,7 +307,7 @@ for.body: ; preds = %entry, %for.body
; 3: + 2*offset
; 4: + 3*offset
;
-; The diff between address 2 and address 1 is offset, and this offset is not reused among other addresses,
+; The diff between address 2 and address 1 is 2*offset, and this offset is not reused among other chains,
; so we can not common any chains.
;
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
@@ -404,7 +407,7 @@ for.body: ; preds = %entry, %for.body
; 5: + 1*offset
; 6: + 2*offset
;
-; The diff between address 2 and address 1 is offset, and this offset is reused between address 4 and address 5.
+; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5.
; but the diff between address 3 and address 2 (3*offset) is not the same with the diff between address 6
; and address 5(2*offset), so we can not common chains for these addresses.
;
@@ -564,24 +567,26 @@ define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i6
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB5_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: mulli r7, r4, 6
-; CHECK-NEXT: add r5, r3, r5
-; CHECK-NEXT: sldi r3, r4, 1
-; CHECK-NEXT: add r9, r4, r3
-; CHECK-NEXT: mtctr r6
; CHECK-NEXT: sldi r8, r4, 2
+; CHECK-NEXT: add r7, r5, r4
+; CHECK-NEXT: mtctr r6
+; CHECK-NEXT: add r5, r5, r8
+; CHECK-NEXT: add r7, r3, r7
+; CHECK-NEXT: sldi r4, r4, 1
+; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: ldx r6, r5, r4
-; CHECK-NEXT: ldx r10, r5, r9
-; CHECK-NEXT: ldx r11, r5, r8
-; CHECK-NEXT: ldx r12, r5, r7
+; CHECK-NEXT: ld r6, 0(r7)
+; CHECK-NEXT: ldx r8, r7, r4
+; CHECK-NEXT: ld r9, 0(r5)
+; CHECK-NEXT: ldx r10, r5, r4
+; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
-; CHECK-NEXT: mulld r6, r10, r6
-; CHECK-NEXT: mulld r6, r6, r11
-; CHECK-NEXT: maddld r3, r6, r12, r3
+; CHECK-NEXT: mulld r6, r8, r6
+; CHECK-NEXT: mulld r6, r6, r9
+; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB5_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
@@ -664,32 +669,30 @@ define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2
; CHECK-NEXT: cmpdi r7, 1
; CHECK-NEXT: blt cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: sldi r8, r4, 1
-; CHECK-NEXT: mtctr r7
-; CHECK-NEXT: add r9, r4, r8
-; CHECK-NEXT: add r8, r6, r9
; CHECK-NEXT: add r6, r6, r4
-; CHECK-NEXT: add r9, r5, r9
; CHECK-NEXT: add r5, r5, r4
-; CHECK-NEXT: li r4, 0
+; CHECK-NEXT: mtctr r7
+; CHECK-NEXT: sldi r4, r4, 1
+; CHECK-NEXT: add r5, r3, r5
+; CHECK-NEXT: add r6, r3, r6
+; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB6_2: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: ldx r7, r3, r5
-; CHECK-NEXT: ldx r10, r3, r9
-; CHECK-NEXT: ldx r11, r3, r6
-; CHECK-NEXT: ldx r12, r3, r8
-; CHECK-NEXT: addi r3, r3, 1
-; CHECK-NEXT: mulld r7, r10, r7
-; CHECK-NEXT: mulld r7, r7, r11
-; CHECK-NEXT: maddld r4, r7, r12, r4
+; CHECK-NEXT: ld r7, 0(r5)
+; CHECK-NEXT: ldx r8, r5, r4
+; CHECK-NEXT: ld r9, 0(r6)
+; CHECK-NEXT: ldx r10, r6, r4
+; CHECK-NEXT: addi r5, r5, 1
+; CHECK-NEXT: addi r6, r6, 1
+; CHECK-NEXT: mulld r7, r8, r7
+; CHECK-NEXT: mulld r7, r7, r9
+; CHECK-NEXT: maddld r3, r7, r10, r3
; CHECK-NEXT: bdnz .LBB6_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
-; CHECK-NEXT: mr r3, r4
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB6_4:
-; CHECK-NEXT: li r4, 0
-; CHECK-NEXT: mr r3, r4
+; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = mul nsw i64 %offset, 3
@@ -748,328 +751,272 @@ for.body: ; preds = %entry, %for.body
define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stdu r1, -336(r1)
-; CHECK-NEXT: .cfi_def_cfa_offset 336
-; CHECK-NEXT: .cfi_offset r14, -144
-; CHECK-NEXT: .cfi_offset r15, -136
-; CHECK-NEXT: .cfi_offset r16, -128
-; CHECK-NEXT: .cfi_offset r17, -120
-; CHECK-NEXT: .cfi_offset r18, -112
-; CHECK-NEXT: .cfi_offset r19, -104
-; CHECK-NEXT: .cfi_offset r20, -96
-; CHECK-NEXT: .cfi_offset r21, -88
-; CHECK-NEXT: .cfi_offset r22, -80
-; CHECK-NEXT: .cfi_offset r23, -72
-; CHECK-NEXT: .cfi_offset r24, -64
-; CHECK-NEXT: .cfi_offset r25, -56
-; CHECK-NEXT: .cfi_offset r26, -48
-; CHECK-NEXT: .cfi_offset r27, -40
-; CHECK-NEXT: .cfi_offset r28, -32
-; CHECK-NEXT: .cfi_offset r29, -24
-; CHECK-NEXT: .cfi_offset r30, -16
-; CHECK-NEXT: .cfi_offset r31, -8
-; CHECK-NEXT: .cfi_offset r2, -152
; CHECK-NEXT: cmpdi r6, 1
-; CHECK-NEXT: std r14, 192(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r15, 200(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r16, 208(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r17, 216(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r18, 224(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r19, 232(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r20, 240(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r21, 248(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r22, 256(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r23, 264(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r24, 272(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r25, 280(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r26, 288(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r27, 296(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r28, 304(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r29, 312(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r30, 320(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r31, 328(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r2, 184(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r9, 40(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r8, 48(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r7, 64(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r5, 80(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r4, 72(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r9, -176(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r8, -168(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r7, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: blt cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: sldi r3, r6, 2
-; CHECK-NEXT: li r4, 1
-; CHECK-NEXT: mr r16, r10
-; CHECK-NEXT: cmpdi r3, 1
-; CHECK-NEXT: iselgt r3, r3, r4
-; CHECK-NEXT: addi r4, r3, -1
-; CHECK-NEXT: clrldi r6, r3, 63
-; CHECK-NEXT: cmpldi r4, 3
+; CHECK-NEXT: sldi r6, r6, 2
+; CHECK-NEXT: li r7, 1
+; CHECK-NEXT: mr r12, r10
+; CHECK-NEXT: cmpdi r6, 1
+; CHECK-NEXT: iselgt r7, r6, r7
+; CHECK-NEXT: addi r8, r7, -1
+; CHECK-NEXT: clrldi r6, r7, 63
+; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
-; CHECK-NEXT: ld r30, 40(r1) # 8-byte Folded Reload
-; CHECK-NEXT: sldi r4, r16, 2
-; CHECK-NEXT: ld r19, 80(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r21, 72(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r22, 56(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r27, 48(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r18, 64(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r5, r30, r4
-; CHECK-NEXT: rldicl r0, r3, 62, 2
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: add r11, r19, r5
-; CHECK-NEXT: add r7, r21, r5
-; CHECK-NEXT: add r5, r22, r5
-; CHECK-NEXT: std r5, 168(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r5, r27, r4
-; CHECK-NEXT: add r4, r18, r4
-; CHECK-NEXT: std r7, 176(r1) # 8-byte Folded Spill
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: sldi r4, r4, 3
-; CHECK-NEXT: add r29, r19, r5
-; CHECK-NEXT: add r7, r21, r5
-; CHECK-NEXT: add r5, r22, r5
-; CHECK-NEXT: add r26, r19, r4
-; CHECK-NEXT: std r5, 152(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r5, r21, r4
-; CHECK-NEXT: add r4, r22, r4
-; CHECK-NEXT: std r7, 160(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r4, 136(r1) # 8-byte Folded Spill
-; CHECK-NEXT: sldi r4, r16, 1
-; CHECK-NEXT: std r5, 144(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r10, r16, r4
-; CHECK-NEXT: add r3, r18, r4
-; CHECK-NEXT: add r5, r30, r10
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: add r23, r19, r5
-; CHECK-NEXT: add r7, r21, r5
-; CHECK-NEXT: add r5, r22, r5
-; CHECK-NEXT: std r5, 120(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r5, r27, r10
-; CHECK-NEXT: std r7, 128(r1) # 8-byte Folded Spill
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: add r20, r19, r5
-; CHECK-NEXT: add r7, r21, r5
-; CHECK-NEXT: add r5, r22, r5
-; CHECK-NEXT: std r5, 104(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r5, r18, r10
-; CHECK-NEXT: std r7, 112(r1) # 8-byte Folded Spill
-; CHECK-NEXT: sub r10, r18, r10
-; CHECK-NEXT: sldi r5, r5, 3
+; CHECK-NEXT: rldicl r7, r7, 62, 2
+; CHECK-NEXT: sldi r10, r12, 2
+; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r31, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill
+; CHECK-NEXT: mr r7, r4
+; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r8, r4, r10
+; CHECK-NEXT: sldi r8, r8, 3
+; CHECK-NEXT: add r9, r5, r8
+; CHECK-NEXT: add r8, r2, r10
+; CHECK-NEXT: add r10, r31, r10
; CHECK-NEXT: sldi r10, r10, 3
-; CHECK-NEXT: add r17, r19, r5
-; CHECK-NEXT: add r7, r21, r5
-; CHECK-NEXT: add r5, r22, r5
-; CHECK-NEXT: std r5, 88(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r5, r30, r4
-; CHECK-NEXT: std r7, 96(r1) # 8-byte Folded Spill
-; CHECK-NEXT: add r7, r19, r3
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: add r14, r19, r5
-; CHECK-NEXT: add r31, r21, r5
-; CHECK-NEXT: add r2, r22, r5
-; CHECK-NEXT: add r5, r27, r4
-; CHECK-NEXT: add r4, r22, r3
-; CHECK-NEXT: sldi r5, r5, 3
-; CHECK-NEXT: add r12, r19, r5
-; CHECK-NEXT: add r8, r21, r5
-; CHECK-NEXT: add r9, r22, r5
-; CHECK-NEXT: add r5, r21, r3
-; CHECK-NEXT: add r3, r16, r30
-; CHECK-NEXT: rldicl r30, r0, 2, 1
-; CHECK-NEXT: addi r0, r30, -4
-; CHECK-NEXT: sldi r28, r3, 3
-; CHECK-NEXT: rldicl r30, r0, 62, 2
-; CHECK-NEXT: add r3, r19, r28
-; CHECK-NEXT: addi r0, r30, 1
-; CHECK-NEXT: add r30, r21, r28
-; CHECK-NEXT: add r28, r22, r28
-; CHECK-NEXT: mtctr r0
-; CHECK-NEXT: add r0, r16, r27
+; CHECK-NEXT: sldi r8, r8, 3
+; CHECK-NEXT: add r30, r5, r10
+; CHECK-NEXT: add r29, r7, r10
+; CHECK-NEXT: add r28, r3, r10
+; CHECK-NEXT: sldi r10, r12, 1
+; CHECK-NEXT: add r8, r5, r8
+; CHECK-NEXT: add r11, r12, r10
+; CHECK-NEXT: add r0, r4, r11
+; CHECK-NEXT: sldi r0, r0, 3
+; CHECK-NEXT: add r27, r5, r0
+; CHECK-NEXT: add r0, r2, r11
+; CHECK-NEXT: add r11, r31, r11
+; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: sldi r0, r0, 3
-; CHECK-NEXT: add r25, r21, r0
-; CHECK-NEXT: add r24, r22, r0
-; CHECK-NEXT: add r22, r22, r10
-; CHECK-NEXT: add r21, r21, r10
-; CHECK-NEXT: add r10, r16, r18
-; CHECK-NEXT: add r27, r19, r0
-; CHECK-NEXT: li r0, 0
-; CHECK-NEXT: sldi r18, r16, 5
+; CHECK-NEXT: add r25, r5, r11
+; CHECK-NEXT: add r24, r7, r11
+; CHECK-NEXT: add r23, r3, r11
+; CHECK-NEXT: add r11, r4, r10
+; CHECK-NEXT: add r26, r5, r0
+; CHECK-NEXT: sldi r11, r11, 3
+; CHECK-NEXT: add r22, r5, r11
+; CHECK-NEXT: add r11, r2, r10
+; CHECK-NEXT: add r10, r31, r10
+; CHECK-NEXT: sldi r10, r10, 3
+; CHECK-NEXT: sldi r11, r11, 3
+; CHECK-NEXT: add r20, r5, r10
+; CHECK-NEXT: add r19, r7, r10
+; CHECK-NEXT: add r18, r3, r10
+; CHECK-NEXT: add r10, r12, r4
+; CHECK-NEXT: add r21, r5, r11
+; CHECK-NEXT: sldi r11, r2, 3
; CHECK-NEXT: sldi r10, r10, 3
-; CHECK-NEXT: add r19, r19, r10
-; CHECK-NEXT: mr r10, r16
+; CHECK-NEXT: add r17, r5, r10
+; CHECK-NEXT: add r10, r12, r2
+; CHECK-NEXT: sldi r10, r10, 3
+; CHECK-NEXT: add r16, r5, r10
+; CHECK-NEXT: add r10, r12, r31
+; CHECK-NEXT: sldi r31, r31, 3
+; CHECK-NEXT: sub r0, r11, r31
+; CHECK-NEXT: sldi r11, r4, 3
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT: sldi r10, r10, 3
+; CHECK-NEXT: add r15, r5, r10
+; CHECK-NEXT: add r14, r3, r10
+; CHECK-NEXT: sub r31, r11, r31
+; CHECK-NEXT: add r2, r4, r10
+; CHECK-NEXT: li r11, 0
+; CHECK-NEXT: mr r10, r12
+; CHECK-NEXT: rldicl r7, r7, 2, 1
+; CHECK-NEXT: addi r7, r7, -4
+; CHECK-NEXT: rldicl r7, r7, 62, 2
+; CHECK-NEXT: addi r7, r7, 1
+; CHECK-NEXT: mtctr r7
+; CHECK-NEXT: sldi r7, r12, 5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lfdux f0, r21, r18
-; CHECK-NEXT: lfdux f1, r22, r18
-; CHECK-NEXT: ld r15, 88(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r10, r10, r16
-; CHECK-NEXT: add r10, r10, r16
-; CHECK-NEXT: xsmuldp f0, f1, f0
-; CHECK-NEXT: lfd f1, 0(r19)
-; CHECK-NEXT: add r10, r10, r16
-; CHECK-NEXT: add r10, r10, r16
-; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfd f0, 0(r19)
-; CHECK-NEXT: add r19, r19, r18
-; CHECK-NEXT: lfdx f0, r24, r0
-; CHECK-NEXT: lfdx f1, r25, r0
+; CHECK-NEXT: lfd f0, 0(r14)
+; CHECK-NEXT: lfd f1, 0(r2)
+; CHECK-NEXT: add r10, r10, r12
+; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r27, r0
+; CHECK-NEXT: lfd f1, 0(r15)
+; CHECK-NEXT: add r10, r10, r12
+; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r27, r0
-; CHECK-NEXT: lfdx f0, r28, r0
-; CHECK-NEXT: lfdx f1, r30, r0
+; CHECK-NEXT: stfd f0, 0(r15)
+; CHECK-NEXT: add r15, r15, r7
+; CHECK-NEXT: lfdx f0, r14, r0
+; CHECK-NEXT: lfdx f1, r2, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r3, r0
+; CHECK-NEXT: lfdx f1, r16, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r3, r0
-; CHECK-NEXT: lfdx f0, r4, r0
-; CHECK-NEXT: lfdx f1, r5, r0
+; CHECK-NEXT: stfdx f0, r16, r11
+; CHECK-NEXT: lfdx f0, r14, r31
+; CHECK-NEXT: lfdx f1, r2, r31
+; CHECK-NEXT: add r14, r14, r7
+; CHECK-NEXT: add r2, r2, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r7, r0
+; CHECK-NEXT: lfdx f1, r17, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r7, r0
-; CHECK-NEXT: lfdx f0, r9, r0
-; CHECK-NEXT: lfdx f1, r8, r0
+; CHECK-NEXT: stfdx f0, r17, r11
+; CHECK-NEXT: lfd f0, 0(r18)
+; CHECK-NEXT: lfd f1, 0(r19)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r12, r0
+; CHECK-NEXT: lfdx f1, r20, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r12, r0
-; CHECK-NEXT: lfdx f0, r2, r0
-; CHECK-NEXT: lfdx f1, r31, r0
+; CHECK-NEXT: stfdx f0, r20, r11
+; CHECK-NEXT: lfdx f0, r18, r0
+; CHECK-NEXT: lfdx f1, r19, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r14, r0
+; CHECK-NEXT: lfdx f1, r21, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r14, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 96(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
-; CHECK-NEXT: ld r15, 104(r1) # 8-byte Folded Reload
+; CHECK-NEXT: stfdx f0, r21, r11
+; CHECK-NEXT: lfdx f0, r18, r31
+; CHECK-NEXT: lfdx f1, r19, r31
+; CHECK-NEXT: add r18, r18, r7
+; CHECK-NEXT: add r19, r19, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r17, r0
+; CHECK-NEXT: lfdx f1, r22, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r17, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 112(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
-; CHECK-NEXT: ld r15, 120(r1) # 8-byte Folded Reload
+; CHECK-NEXT: stfdx f0, r22, r11
+; CHECK-NEXT: lfd f0, 0(r23)
+; CHECK-NEXT: lfd f1, 0(r24)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r20, r0
+; CHECK-NEXT: lfdx f1, r25, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r20, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 128(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
-; CHECK-NEXT: ld r15, 136(r1) # 8-byte Folded Reload
+; CHECK-NEXT: stfdx f0, r25, r11
+; CHECK-NEXT: lfdx f0, r23, r0
+; CHECK-NEXT: lfdx f1, r24, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r23, r0
+; CHECK-NEXT: lfdx f1, r26, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r23, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 144(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
-; CHECK-NEXT: ld r15, 152(r1) # 8-byte Folded Reload
+; CHECK-NEXT: stfdx f0, r26, r11
+; CHECK-NEXT: lfdx f0, r23, r31
+; CHECK-NEXT: lfdx f1, r24, r31
+; CHECK-NEXT: add r23, r23, r7
+; CHECK-NEXT: add r24, r24, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r26, r0
+; CHECK-NEXT: lfdx f1, r27, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r26, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 160(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
-; CHECK-NEXT: ld r15, 168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: stfdx f0, r27, r11
+; CHECK-NEXT: lfd f0, 0(r28)
+; CHECK-NEXT: lfd f1, 0(r29)
; CHECK-NEXT: xsmuldp f0, f0, f1
+; CHECK-NEXT: lfdx f1, r30, r11
+; CHECK-NEXT: xsadddp f0, f1, f0
+; CHECK-NEXT: stfdx f0, r30, r11
+; CHECK-NEXT: lfdx f0, r28, r0
; CHECK-NEXT: lfdx f1, r29, r0
+; CHECK-NEXT: xsmuldp f0, f0, f1
+; CHECK-NEXT: lfdx f1, r8, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r29, r0
-; CHECK-NEXT: lfdx f0, r15, r0
-; CHECK-NEXT: ld r15, 176(r1) # 8-byte Folded Reload
-; CHECK-NEXT: lfdx f1, r15, r0
+; CHECK-NEXT: stfdx f0, r8, r11
+; CHECK-NEXT: lfdx f0, r28, r31
+; CHECK-NEXT: lfdx f1, r29, r31
+; CHECK-NEXT: add r28, r28, r7
+; CHECK-NEXT: add r29, r29, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r11, r0
+; CHECK-NEXT: lfdx f1, r9, r11
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r11, r0
-; CHECK-NEXT: add r0, r0, r18
+; CHECK-NEXT: stfdx f0, r9, r11
+; CHECK-NEXT: add r11, r11, r7
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
-; CHECK-NEXT: ld r12, 64(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r3, 40(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r8, 48(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r30, 80(r1) # 8-byte Folded Reload
-; CHECK-NEXT: sldi r4, r16, 3
-; CHECK-NEXT: ld r29, 72(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r28, 56(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r0, r10, r12
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: add r8, r10, r8
-; CHECK-NEXT: sub r10, r0, r16
-; CHECK-NEXT: sldi r7, r3, 3
-; CHECK-NEXT: sldi r11, r8, 3
+; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r7, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: sldi r8, r12, 3
+; CHECK-NEXT: add r0, r10, r0
+; CHECK-NEXT: add r7, r10, r7
; CHECK-NEXT: sldi r0, r0, 3
-; CHECK-NEXT: sldi r12, r10, 3
-; CHECK-NEXT: add r3, r30, r7
-; CHECK-NEXT: add r5, r29, r7
-; CHECK-NEXT: add r7, r28, r7
-; CHECK-NEXT: add r8, r30, r11
-; CHECK-NEXT: add r9, r29, r11
-; CHECK-NEXT: add r11, r28, r11
-; CHECK-NEXT: add r30, r30, r0
-; CHECK-NEXT: li r0, 0
-; CHECK-NEXT: add r10, r28, r12
-; CHECK-NEXT: add r12, r29, r12
+; CHECK-NEXT: sldi r11, r7, 3
+; CHECK-NEXT: add r30, r5, r0
+; CHECK-NEXT: add r29, r4, r0
+; CHECK-NEXT: add r28, r3, r0
+; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r7, r5, r11
+; CHECK-NEXT: add r9, r4, r11
+; CHECK-NEXT: add r11, r3, r11
+; CHECK-NEXT: add r10, r10, r0
+; CHECK-NEXT: sub r12, r10, r12
+; CHECK-NEXT: sldi r10, r10, 3
+; CHECK-NEXT: sldi r12, r12, 3
+; CHECK-NEXT: add r5, r5, r10
+; CHECK-NEXT: li r10, 0
+; CHECK-NEXT: add r3, r3, r12
+; CHECK-NEXT: add r4, r4, r12
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
-; CHECK-NEXT: lfdux f0, r12, r4
-; CHECK-NEXT: lfdux f1, r10, r4
+; CHECK-NEXT: lfdux f0, r4, r8
+; CHECK-NEXT: lfdux f1, r3, r8
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f1, f0
-; CHECK-NEXT: lfd f1, 0(r30)
+; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfd f0, 0(r30)
-; CHECK-NEXT: add r30, r30, r4
-; CHECK-NEXT: lfdx f0, r11, r0
-; CHECK-NEXT: lfdx f1, r9, r0
+; CHECK-NEXT: stfd f0, 0(r5)
+; CHECK-NEXT: add r5, r5, r8
+; CHECK-NEXT: lfdx f0, r28, r10
+; CHECK-NEXT: lfdx f1, r29, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r8, r0
+; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r8, r0
-; CHECK-NEXT: lfdx f0, r7, r0
-; CHECK-NEXT: lfdx f1, r5, r0
+; CHECK-NEXT: stfdx f0, r30, r10
+; CHECK-NEXT: lfdx f0, r11, r10
+; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r3, r0
+; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r3, r0
-; CHECK-NEXT: add r0, r0, r4
+; CHECK-NEXT: stfdx f0, r7, r10
+; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
-; CHECK-NEXT: ld r2, 184(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r31, 328(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r30, 320(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r29, 312(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: li r3, 0
-; CHECK-NEXT: ld r28, 304(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r27, 296(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r26, 288(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r25, 280(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r24, 272(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r23, 264(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r22, 256(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r21, 248(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r20, 240(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r19, 232(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r18, 224(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r17, 216(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r16, 208(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r15, 200(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r14, 192(r1) # 8-byte Folded Reload
-; CHECK-NEXT: addi r1, r1, 336
+; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
%cmp49 = icmp sgt i64 %m, 0
More information about the llvm-commits
mailing list