[llvm] r294821 - Fix PR23384 (under "-lsr-insns-cost" option)
Evgeny Stupachenko via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 10 18:57:43 PST 2017
Author: evstupac
Date: Fri Feb 10 20:57:43 2017
New Revision: 294821
URL: http://llvm.org/viewvc/llvm-project?rev=294821&view=rev
Log:
Fix PR23384 (under "-lsr-insns-cost" option)
Summary:
The patch adds the number of instructions generated by a solution
to the LSR cost, under the "-lsr-insns-cost" option.
Reviewers: qcolombet, hfinkel
Differential Revision: http://reviews.llvm.org/D28307
From: Evgeny Stupachenko <evstupac at gmail.com>
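For reference, the new heuristic is off by default; it is exercised via the
hidden flag added below, e.g. as in the new tests' RUN lines:
  opt -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S < test.ll
  llc -O2 -march=x86-64 -lsr-insns-cost < test.ll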
Added:
llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
Modified:
llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
Modified: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=294821&r1=294820&r2=294821&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp Fri Feb 10 20:57:43 2017
@@ -129,6 +129,11 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
+// The flag adds the instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to the LSR cost model"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -325,6 +330,8 @@ struct Formula {
bool unscale();
+ bool hasZeroEnd() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -459,6 +466,14 @@ bool Formula::unscale() {
return true;
}
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -895,6 +910,7 @@ namespace {
class Cost {
/// TODO: Some of these could be merged. Also, a lexical ordering
/// isn't always optimal.
+ unsigned Insns;
unsigned NumRegs;
unsigned AddRecCost;
unsigned NumIVMuls;
@@ -905,8 +921,8 @@ class Cost {
public:
Cost()
- : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
- SetupCost(0), ScaleCost(0) {}
+ : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+ ImmCost(0), SetupCost(0), ScaleCost(0) {}
bool operator<(const Cost &Other) const;
@@ -915,9 +931,9 @@ public:
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
- return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+ return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
| ImmCost | SetupCost | ScaleCost) != ~0u)
- || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+ || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
& ImmCost & SetupCost & ScaleCost) == ~0u);
}
#endif
@@ -1163,6 +1179,9 @@ void Cost::RateFormula(const TargetTrans
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
// Tally up the registers.
+ unsigned PrevAddRecCost = AddRecCost;
+ unsigned PrevNumRegs = NumRegs;
+ unsigned PrevNumBaseAdds = NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1182,6 +1201,18 @@ void Cost::RateFormula(const TargetTrans
return;
}
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // an additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (NumRegs > TTIRegNum) {
+ // If the cost already exceeded TTIRegNum, then only newly added
+ // registers can add new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ Insns += (NumRegs - PrevNumRegs);
+ else
+ Insns += (NumRegs - TTIRegNum);
+ }
+
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.getNumRegs();
if (NumBaseParts > 1)
@@ -1210,11 +1241,30 @@ void Cost::RateFormula(const TargetTrans
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
NumBaseAdds++;
}
+
+ // If the ICmpZero formula does not end with 0, it cannot be replaced by
+ // just an add or sub. We will need to compare the final result of the
+ // AddRec, so we will need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ Insns += (AddRecCost - PrevAddRecCost);
+
+ // NumBaseAdds accounts for the instructions needed for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ Insns += NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}
/// Set this cost to a losing value.
void Cost::Lose() {
+ Insns = ~0u;
NumRegs = ~0u;
AddRecCost = ~0u;
NumIVMuls = ~0u;
@@ -1226,6 +1276,8 @@ void Cost::Lose() {
/// Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
+ if (InsnsCost && Insns != Other.Insns)
+ return Insns < Other.Insns;
return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
ImmCost, SetupCost) <
std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1234,6 +1286,7 @@ bool Cost::operator<(const Cost &Other)
}
void Cost::print(raw_ostream &OS) const {
+ OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
if (AddRecCost != 0)
OS << ", with addrec cost " << AddRecCost;
Added: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll?rev=294821&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll (added)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll Fri Feb 10 20:57:43 2017
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; The OPT runs check that LSR rewrites the compare against a static trip
+; count into a compare with 0.
+
+; BOTH: for.body:
+; INSN: icmp eq i64 %lsr.iv.next, 0
+; REGS: icmp eq i64 %indvars.iv.next, 1024
+
+; The LLC run checks that LSR optimizes the compare for a static counter.
+; That is, instead of generating the following:
+; movl %ecx, (%rdx,%rax,4)
+; incq %rax
+; cmpq $1024, %rax
+; LSR should optimize out the cmp:
+; movl %ecx, 4096(%rdx,%rax)
+; addq $4, %rax
+; or
+; movl %ecx, 4096(%rdx,%rax,4)
+; incq %rax
+
+; CHECK: LBB0_1:
+; CHECK-NEXT: movl 4096(%{{...}},[[REG:%...]]
+; CHECK-NEXT: addl 4096(%{{...}},[[REG]]
+; CHECK-NEXT: movl %{{...}}, 4096(%{{...}},[[REG]]
+; CHECK-NOT: cmp
+; CHECK: jne
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+ %tmp1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %tmp1, %tmp
+ %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
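For readability, the IR above corresponds roughly to this C++ loop (a
reconstruction for illustration, not part of the commit):

  void foo(int *x, int *y, int *q) {
    // 1024-iteration counted loop; LSR can fold the constant trip count
    // into the addressing and compare the induction variable against zero.
    for (long i = 0; i < 1024; ++i)
      q[i] = x[i] + y[i];
  }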
Added: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll?rev=294821&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll (added)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll Fri Feb 10 20:57:43 2017
@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; The OPT runs check that LSR prefers fewer instructions over fewer
+; registers. For x86, LSR should prefer a complicated addressing mode to
+; new LSR induction variables.
+
+; BOTH: for.body:
+; INSN: getelementptr i32, i32* %x, i64 %indvars.iv
+; INSN: getelementptr i32, i32* %y, i64 %indvars.iv
+; INSN: getelementptr i32, i32* %q, i64 %indvars.iv
+; REGS: %lsr.iv4 = phi
+; REGS: %lsr.iv2 = phi
+; REGS: %lsr.iv1 = phi
+; REGS: getelementptr i32, i32* %lsr.iv1, i64 1
+; REGS: getelementptr i32, i32* %lsr.iv2, i64 1
+; REGS: getelementptr i32, i32* %lsr.iv4, i64 1
+
+; The LLC run checks that LSR prefers fewer instructions over fewer
+; registers. LSR should prefer a complicated addressing mode to additional
+; add instructions.
+
+; CHECK: LBB0_2:
+; CHECK-NEXT: movl (%r{{[a-z][a-z]}},
+; CHECK-NEXT: addl (%r{{[a-z][a-z]}},
+; CHECK-NEXT: movl %e{{[a-z][a-z]}}, (%r{{[a-z][a-z]}},
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+ %tmp1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %tmp1, %tmp
+ %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
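As above, the IR corresponds roughly to this C++ loop (a reconstruction for
illustration, not part of the commit), with a runtime trip count %n instead
of a constant:

  void foo(int *x, int *y, int *q, int n) {
    // With -lsr-insns-cost, LSR should keep the single counter and fold it
    // into the three memory operands, rather than rewriting the loop to use
    // three separately incremented pointer induction variables.
    for (int i = 0; i < n; ++i)
      q[i] = x[i] + y[i];
  }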