[llvm-commits] [PATCH][FastMath, InstCombine] Fadd/Fsub optimizations
Shuxin Yang
shuxin.llvm at gmail.com
Tue Dec 11 13:32:15 PST 2012
Hi, Dear All:
The attached patch is to implement following rules about
floating-point add/sub in relaxed mode.
(The n-th rule is not yet implemented; I only realized it while
writing this mail.
It is easy to implement, but I would rather not go through
the stress tests one more time.)
----------------------------------------------------
1. (x + c1) + c2 -> x + (c1 + c2)
2. (c * x) + x -> (c+1) * x
3. (x + x) + x -> x * 3
4. c * x + (x + x) -> (c + 2)*x
5. (x + x) + (x+x) -> 4*x
6. x - (x + y) -> 0 - y
...
...
...
n. (factoring) C * X1 + C * X2 -> C(X1 + X2)
-------------------------------------------------------
Up to three neighboring instructions are involved in the
optimization. The number
of combinations is daunting! So I have to resort to a general
approach (instead of
pattern matching) to tackle these optimizations.
The idea is simple: just try to decompose instructions into
uniformly represented
Addends. Take the following instruction sequence as an example:
t1 = 1.8 * x;
t2 = y - x;
t3 = t1 - t2;
t3 has two addends A1=<1, t1> (denoting the value 1*t1) and A2=<-1, t2>. If
we "zoom in" on
A1 and A2 one step, we will reveal more addends: A1 can be zoomed in
to another
addend A1_0 = <1.8, x>, and A2 can be zoomed in to <1,y> and <-1,x>.
When these addends are available, the optimizer tries to optimize the
following N-ary additions
using symbolic evaluation:
A1_0 + A2_0 + A2_1, or
A1 + A2_0 + A2_1 or
A1_0 + A2
This patch is stress-tested with SingleSrc and MultiSource by
treating all fadd/fsub
instructions as being in relaxed mode.
Thank you for code review!
Shuxin
-------------- next part --------------
Index: test/Transforms/InstCombine/fast-math.ll
===================================================================
--- test/Transforms/InstCombine/fast-math.ll (revision 169752)
+++ test/Transforms/InstCombine/fast-math.ll (working copy)
@@ -3,19 +3,17 @@
; testing-case "float fold(float a) { return 1.2f * a * 2.3f; }"
; 1.2f and 2.3f is supposed to be fold.
define float @fold(float %a) {
-fold:
%mul = fmul fast float %a, 0x3FF3333340000000
%mul1 = fmul fast float %mul, 0x4002666660000000
ret float %mul1
-; CHECK: fold
+; CHECK: @fold
; CHECK: fmul float %a, 0x4006147AE0000000
}
; Same testing-case as the one used in fold() except that the operators have
; fixed FP mode.
define float @notfold(float %a) {
-notfold:
-; CHECK: notfold
+; CHECK: @notfold
; CHECK: %mul = fmul fast float %a, 0x3FF3333340000000
%mul = fmul fast float %a, 0x3FF3333340000000
%mul1 = fmul float %mul, 0x4002666660000000
@@ -23,10 +21,96 @@
}
define float @fold2(float %a) {
-fold2:
-; CHECK: fold2
+; CHECK: @fold2
; CHECK: fmul float %a, 0x4006147AE0000000
%mul = fmul float %a, 0x3FF3333340000000
%mul1 = fmul fast float %mul, 0x4002666660000000
ret float %mul1
}
+
+; C * f1 + f1 = (C+1) * f1
+define double @fold3(double %f1) {
+ %t1 = fmul fast double 2.000000e+00, %f1
+ %t2 = fadd fast double %f1, %t1
+ ret double %t2
+; CHECK: @fold3
+; CHECK: fmul fast double %f1, 3.000000e+00
+}
+
+; (C1 - X) + (C2 - Y) => ((C1+C2) - X) - Y
+define float @fold4(float %f1, float %f2) nounwind uwtable readnone ssp {
+ %sub = fsub float 4.000000e+00, %f1
+ %sub1 = fsub float 5.000000e+00, %f2
+ %add = fadd fast float %sub, %sub1
+ ret float %add
+; CHECK: @fold4
+; CHECK: fsub fast float 9.000000e+00, %f1
+}
+
+; (X + C1) + C2 => X + (C1 + C2)
+define float @fold5(float %f1, float %f2) nounwind uwtable readnone ssp {
+ %add = fadd float %f1, 4.000000e+00
+ %add1 = fadd fast float %add, 5.000000e+00
+ ret float %add1
+; CHECK: @fold5
+; CHECK: fadd float %f1, 9.000000e+00
+}
+
+; (X + X) + X => 3.0 * X
+define float @fold6(float %f1) {
+ %t1 = fadd fast float %f1, %f1
+ %t2 = fadd fast float %f1, %t1
+ ret float %t2
+; CHECK: @fold6
+; CHECK: fmul fast float %f1, 3.000000e+00
+}
+
+; C1 * X + (X + X) = (C1 + 2) * X
+define float @fold7(float %f1) {
+ %t1 = fmul fast float %f1, 5.000000e+00
+ %t2 = fadd fast float %f1, %f1
+ %t3 = fadd fast float %t1, %t2
+ ret float %t3
+; CHECK: @fold7
+; CHECK: fmul fast float %f1, 7.000000e+00
+}
+
+; (X + X) + (X + X) => 4.0 * X
+define float @fold8(float %f1) {
+ %t1 = fadd fast float %f1, %f1
+ %t2 = fadd fast float %f1, %f1
+ %t3 = fadd fast float %t1, %t2
+ ret float %t3
+; CHECK: @fold8
+; CHECK: fmul fast float %f1, 4.000000e+00
+}
+
+; X - (X + Y) => 0 - Y
+define float @fold9(float %f1, float %f2) {
+ %t1 = fadd float %f1, %f2
+ %t3 = fsub fast float %f1, %t1
+ ret float %t3
+
+; CHECK: @fold9
+; CHECK: fsub fast float 0.000000e+00, %f2
+}
+
+
+; once cause Crash/miscompilation
+define float @fail1(float %f1, float %f2) {
+ %conv3 = fadd fast float %f1, -1.000000e+00
+ %add = fadd fast float %conv3, %conv3
+ %add2 = fadd fast float %add, %conv3
+ ret float %add2
+; CHECK: @fail1
+; CHECK: ret
+}
+
+define double @fail2(double %f1, double %f2) {
+ %t1 = fsub fast double %f1, %f2
+ %t2 = fadd fast double %f1, %f2
+ %t3 = fsub fast double %t1, %t2
+ ret double %t3
+; CHECK: @fail2
+; CHECK: ret
+}
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp (revision 169752)
+++ lib/Transforms/InstCombine/InstructionCombining.cpp (working copy)
@@ -53,6 +53,7 @@
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "InstCombineFastMath.h"
#include <algorithm>
#include <climits>
using namespace llvm;
@@ -2406,6 +2407,9 @@
InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
Simplifier = &TheSimplifier;
+ FastMathInstComb FMC(Builder);
+ FastMathCombiner = &FMC;
+
bool EverMadeChange = false;
// Lower dbg.declare intrinsics otherwise their value may be clobbered
Index: lib/Transforms/InstCombine/InstCombineFastMath.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineFastMath.h (revision 0)
+++ lib/Transforms/InstCombine/InstCombineFastMath.h (revision 0)
@@ -0,0 +1,209 @@
+//===- InstCombineFastMath.h - Fast-math InstCombine definition ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef INSTCOMBINE_FASTMATH_H
+#define INSTCOMBINE_FASTMATH_H
+
+namespace llvm {
+
+class FastMathInstComb {
+public:
+ Value *simplifyFAdd(Instruction *I) { return getFAddCombiner()->simplify(I); }
+ Value *simplifyFSub(Instruction *I) { return simplifyFAdd(I);}
+
+ FastMathInstComb(InstCombiner::BuilderTy *B) : FAddComb(0), Builder(B) {}
+ ~FastMathInstComb() { delete FAddComb; }
+
+private:
+ class FAddCombine;
+
+ FAddCombine *getFAddCombiner()
+ { return FAddComb ? FAddComb : (FAddComb = new FAddCombine(Builder)); }
+
+ FAddCombine *FAddComb;
+ InstCombiner::BuilderTy *Builder;
+
+
+ //===----------------------------------------------------------------===//
+ //
+ // Helper classes starts from this point.
+ //
+ //===-----------------------------------------------------------------===//
+private:
+
+ /// Class representing coefficient of floating-point addend.
+ /// This class needs to be highly efficient.
+ class FAddendCoef {
+ public:
+ // The constructor has to initialize an APFloat, which is unnecessary for
+ // most addends which have coefficient either 1 or -1. So, the constructor
+ // is expensive. In order to avoid the cost of the constructor, we should
+ // reuse some instances whenever possible. The pre-created instances
+ // FAddCombine::Add[0-5] embodies this idea.
+ //
+ FAddendCoef() : FpVal(0.0), IntVal(0), isInt(true) {}
+
+ void set(short C) {
+ assert(!InsaneIntVal(C) && "Insane coefficient");
+ isInt = true; IntVal = C;
+ }
+ void set(const APFloat& C) { isInt = false; FpVal = C; }
+
+ bool isZero() const { return isInt ? !IntVal : FpVal.isZero();}
+
+ void negate();
+
+ // If possible, don't define operator+/operator- etc because these
+ // operators inevitably call FAddendCoef's constructor which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator-=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ bool isOne() const { return isInt && IntVal == 1; }
+ bool isTwo() const { return isInt && IntVal == 2; }
+ bool isMinusOne() const { return isInt && IntVal == -1; }
+ bool isMinusTwo() const { return isInt && IntVal == -2; }
+
+ Value *getValue(Type *) const;
+
+ private:
+ bool InsaneIntVal(int V) { return V > 4 || V < -4; }
+
+ APFloat FpVal;
+ // The integer coefficient of an individual addend is either 1 or -1,
+ // and we try to simplify at most 4 addends from neighboring at most
+ // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
+ // is overkill to this end.
+ short IntVal;
+ bool isInt;
+ };
+
+ /// FAddend is used to represent floating-point addend. An addend is
+ /// represented as <C, V>, where V is the symbolic value, and C is a
+ /// constant coefficient. A constant addend is represented as <C, 0>.
+ ///
+ class FAddend {
+ public:
+ typedef enum {
+ Simpler, // addend1 = c1*x, addend2 = c2*x, result = (c1+c2)*x
+ FlushToZero, // similar to the case of Simpler, except that (c1+c2) is a
+ // denormal, and the result is flushed to zero.
+ Zero, // addend1 = c1*x, addend2 = -c1*x , result = 0
+ Fail // addend1 = c1*x, addend2 = c2*y, and x != y
+ } SimpResult;
+
+ FAddend() { Val = 0; }
+
+ Value *getSymVal (void) const { return Val; }
+ const FAddendCoef& getCoef(void) const { return Coeff; }
+
+ bool isConstant() const { return Val == 0; }
+
+ void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+ void set(const APFloat& Coefficient, Value *V)
+ { Coeff.set(Coefficient); Val = V; }
+ void set(const ConstantFP* Coefficient, Value *V)
+ { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+
+ void negate() { Coeff.negate(); }
+
+ /// Try to simplify "\p this + \p Addend2". Iff simplification was
+ /// successful, the resulting value will be saved to "this" instance.
+ SimpResult trySimplifyAdd(const FAddend& Addend2, bool FlushToZero=false);
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+ /// Similar to FAddend::drillDownOneStep() except that the value being
+ /// splitted is the addend itself.
+ unsigned drillDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ FAddendCoef Coeff;
+ Value *Val;
+ };
+
+ /// This functor works with std::sort to permute addends such that those
+ /// having same symbolic-value are clustered together.
+ struct FAddendCmp {
+ bool operator()(const FAddend *A1, const FAddend *A2) {
+ return A1->getSymVal() < A2->getSymVal();
+ }
+ };
+
+ /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
+ /// with its neighboring at most two instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ typedef SmallVector<const FAddend*, 4> AddendVect;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ /// Convert given addend to a Value
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst);
+
+ InstCombiner::BuilderTy *Builder;
+ Instruction *Instr;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ private:
+ // "Messy" stuff to make simplifyFAdd() faster. NOTE: the functions
+ // defined this section can only be called by simplifyFAdd() itself.
+
+ // At most 4 addends are involved in simplification, so we need at
+ // most 4 - 1 tmp addends to evaluate the intermediate results.
+ #define MAX_TMP_ADDEND_NUM 3
+
+ FAddend *allocTmpAddend() {
+ assert((NextFreeIdx < MAX_TMP_ADDEND_NUM) && "run out of tmp addends");
+ return &TmpAddends[NextFreeIdx++];
+ }
+
+ void freeAllTmpAddends() { NextFreeIdx = 0; }
+
+ FAddend TmpAddends[MAX_TMP_ADDEND_NUM];
+ unsigned NextFreeIdx;
+
+ #undef MAX_TMP_ADDEND_NUM
+
+ private:
+ // Debugging stuff are clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+ };
+};
+
+} // end namespace llvm.
+
+#endif
Index: lib/Transforms/InstCombine/InstCombine.h
===================================================================
--- lib/Transforms/InstCombine/InstCombine.h (revision 169752)
+++ lib/Transforms/InstCombine/InstCombine.h (working copy)
@@ -67,6 +67,8 @@
Worklist.Add(I);
}
};
+
+class FastMathInstComb;
/// InstCombiner - The -instcombine pass.
class LLVM_LIBRARY_VISIBILITY InstCombiner
@@ -76,6 +78,8 @@
TargetLibraryInfo *TLI;
bool MadeIRChange;
LibCallSimplifier *Simplifier;
+ FastMathInstComb *FastMathCombiner;
+
public:
/// Worklist - All of the instructions that need to be simplified.
InstCombineWorklist Worklist;
Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp (revision 169752)
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp (working copy)
@@ -16,13 +16,516 @@
#include "llvm/DataLayout.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
+#include "InstCombineFastMath.h"
using namespace llvm;
using namespace PatternMatch;
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// FastMathInstComb::{FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+
+void FastMathInstComb::FAddendCoef::operator=(const FAddendCoef& That) {
+ if ((isInt = That.isInt))
+ IntVal = That.IntVal;
+ else
+ FpVal = That.FpVal;
+}
+
+void FastMathInstComb::FAddendCoef::operator+=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt == That.isInt) {
+ if (isInt)
+ IntVal += That.IntVal;
+ else
+ FpVal.add(That.FpVal, RndMode);
+ return;
+ }
+
+ if (isInt) {
+ FpVal = That.FpVal;
+ FpVal.add(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+ isInt = false;
+ return;
+ }
+
+ FpVal.add(APFloat(FpVal.getSemantics(), That.IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator-=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt == That.isInt) {
+ if (isInt)
+ IntVal -= That.IntVal;
+ else
+ FpVal.subtract(That.FpVal, RndMode);
+ return;
+ }
+
+ if (isInt) {
+ FpVal = That.FpVal;
+ FpVal.subtract(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+ isInt = false;
+ return;
+ }
+
+ FpVal.subtract(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt && That.isInt) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!InsaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt ? That.FpVal.getSemantics() : FpVal.getSemantics();
+
+ APFloat &F0 = FpVal;
+ if (isInt)
+ F0 = APFloat(Semantic, IntVal);
+
+ if (That.isInt)
+ F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.FpVal, APFloat::rmNearestTiesToEven);
+
+ return;
+}
+
+void FastMathInstComb::FAddendCoef::negate() {
+ if (isInt)
+ IntVal = 0 - IntVal;
+ else
+ FpVal.changeSign();
+}
+
+Value *FastMathInstComb::FAddendCoef::getValue(Type *Ty) const {
+ return isInt ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), FpVal);
+}
+
+FastMathInstComb::FAddend::SimpResult
+FastMathInstComb::FAddend::trySimplifyAdd
+ (const FAddend& Addend2, bool FlushToZero) {
+ // Currently flush-to-0 is ignored. The following statement is to suppress
+ // a compiler warning.
+ FlushToZero = !FlushToZero;
+
+ if (Val != Addend2.Val)
+ return Fail;
+
+ Coeff += Addend2.Coeff;
+
+ return Coeff.isZero() ? Zero : Simpler;
+}
+
+// The definition of <Val> Addends
+// =========================================
+// A + B <1, A>, <1,B>
+// A - B <1, A>, <1,B>
+// 0 - B <-1, B>
+// C * A, <C, A>
+// A + C <1, A> <C, NULL>
+// 0 +/- 0 <0, NULL> (corner case)
+//
+// Legend: A, B are not constant, C is constant
+//
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = 0;
+ if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = 0;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = 0;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, 0);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, 0);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(0.0f), 0);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
+Value *FastMathInstComb::FAddCombine::simplify(Instruction *I) {
+
+ assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+ // Currently we are unable to handle vector types.
+ if (I->getType()->isVectorTy())
+ return 0;
+
+ if (I->getOpcode() != Instruction::FAdd &&
+ I->getOpcode() != Instruction::FSub)
+ return 0;
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ unsigned OpndNum = FAddend::drillDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+ // The input instruction is: "I = 0.0 +/- V". If "V" could be
+ // split into two addends, say "V = X - Y", the instruction would have
+ // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef& CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : 0;
+ }
+
+ // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ return 0;
+}
+
+
+Value *FastMathInstComb::FAddCombine::simplifyFAdd
+ (AddendVect& Addends, unsigned InstrQuota) {
+
+ // Permute the input addends such that addends sharing same symbolic-value
+ // are clustered together. e.g. { c1*x, c2*y, c3*x, c4*y, ... } =>
+ /// { c1*x, c3*x, c2*y, c4*y, ...}.
+ std::sort(Addends.begin(), Addends.end(), FAddendCmp());
+
+ freeAllTmpAddends();
+
+ // Walk forward along the sorted addends, trying to combine adjacent two
+ // addends into a single one.
+ AddendVect SimpVect;
+ for (AddendVect::iterator I = Addends.begin(), E = Addends.end();
+ I != E; I++) {
+ const FAddend* Opnd = *I;
+ if (SimpVect.empty()) {
+ SimpVect.push_back(Opnd);
+ continue;
+ }
+
+ // Try to combine the current addend with the previous adjacent addend
+ const FAddend *Opnd0 = SimpVect.back();
+ if (Opnd0->getSymVal() != Opnd->getSymVal()) {
+ // case 1: Opnd0 + Opnd can not be simplified.
+ SimpVect.push_back(Opnd);
+ continue;
+ }
+
+ SimpVect.pop_back();
+ FAddend *T = allocTmpAddend();
+ *T = *Opnd0;
+
+ FAddend::SimpResult R = T->trySimplifyAdd(*Opnd);
+
+ // case 2: Opnd0 + Opnd = 0
+ if (R == FAddend::Zero || R == FAddend::FlushToZero)
+ continue;
+
+ // case 3: Opnd0 + Opnd = C * X
+ assert (R == FAddend::Simpler);
+ SimpVect.push_back(T);
+ }
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
+
+Value *FastMathInstComb::FAddCombine::createNaryFAdd
+ (const AddendVect& Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instruction needed exceeds the quota.
+ //
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return 0;
+
+ initCreateInstNum();
+
+ // step 2: Emit the N-ary addition.
+ // Note that at most three instructions are involved in Fadd-InstCombine: the
+ // addition in question, and at most two neighboring instructions.
+ // The resulting optimized addition should have at least one less instruction
+ // than the original addition expression tree. This implies the resulting
+ // N-ary addition has at most two instructions, and we don't need to worry
+ // about tree-height when constructing the N-ary addition.
+
+ Value *LastVal = 0;
+ bool LastValNeedNeg = false;
+
+ // Iterate the addends, creating fadd/fsub using adjacent two addends.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ bool NeedNeg;
+ Value *V = createAddendVal(**I, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+ #ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+ #endif
+
+ return LastVal;
+}
+
+Value *FastMathInstComb::FAddCombine::createFSub
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFNeg(Value *V) {
+ Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+ return createFSub(Zero, V);
+}
+
+Value *FastMathInstComb::FAddCombine::createFAdd
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+void FastMathInstComb::FAddCombine::createInstPostProc(Instruction *NewInstr) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+ // keep track of the number of instruction created.
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instruction needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FastMathInstComb::FAddCombine::calcInstrNumber
+ (const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form the "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ const FAddend *Opnd = *I;
+ if (Opnd->isConstant())
+ continue;
+
+ const FAddendCoef& CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ if (NegOpndNum == OpndNum)
+ InstrNeeded++;
+ return InstrNeeded;
+}
+
+// Input Addend Value NeedNeg(output)
+// ================================================================
+// Constant C C false
+// <+/-1, V> V coefficient is -1
+// <2/-2, V> "fadd V, V" coefficient is -2
+// <C, V> "fmul V, C" false
+//
+Value *FastMathInstComb::FAddCombine::createAddendVal
+ (const FAddend &Opnd, bool& NeedNeg) {
+ const FAddendCoef& Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
/// AddOne - Add one to a ConstantInt.
static Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
+
/// SubOne - Subtract one from a ConstantInt.
static Constant *SubOne(ConstantInt *C) {
return ConstantInt::get(C->getContext(), C->getValue()-1);
@@ -417,6 +920,10 @@
}
}
+ if (I.hasUnsafeAlgebra())
+ if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+ return ReplaceInstUsesWith(I, V);
+
return Changed ? &I : 0;
}
@@ -657,5 +1164,8 @@
if (Value *V = dyn_castFNegVal(Op1))
return BinaryOperator::CreateFAdd(Op0, V);
+ if (I.hasUnsafeAlgebra())
+ if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+ return ReplaceInstUsesWith(I, V);
return 0;
}
More information about the llvm-commits
mailing list