[llvm-commits] [PATCH][FastMath, InstCombine] Fadd/Fsub optimizations
Shuxin Yang
shuxin.llvm at gmail.com
Tue Dec 11 13:32:15 PST 2012
Hi, Dear All:
The attached patch is to implement following rules about
floating-point add/sub in relaxed mode.
(The n-th rule is not yet implemented; I only realized it while
writing this mail.
It is easy to implement, but I would rather not go through
the stress tests one more time.)
----------------------------------------------------
1. (x + c1) + c2 -> x + (c1 + c2)
2. (c * x) + x -> (c+1) * x
3. (x + x) + x -> x * 3
4. c * x + (x + x) -> (c + 2)*x
5. (x + x) + (x+x) -> 4*x
6. x - (x + y) -> 0 - y
...
...
...
n. (factoring) C * X1 + C * X2 -> C(X1 + X2)
-------------------------------------------------------
Up to three neighboring instructions are involved in the
optimization. The number
of combinations is daunting! So I have to resort to a general
approach (instead of
pattern matching) to tackle these optimizations.
The idea is simple: just try to decompose instructions into
uniformly represented
Addends. Take the following instruction sequence as an example:
t1 = 1.8 * x;
t2 = y - x;
t3 = t1 - t2;
t3 has two addends A1=<1, t1> (denoting the value 1*t1) and A2=<-1, t2>. If
we "zoom in" on
A1 and A2 one step, we will reveal more addends: A1 can be zoomed in
to another
addend A1_0 = <1.8, x>, and A2 can be zoomed in to <1,y> and <-1,x>.
When these addends are available, the optimizer tries to optimize the
following N-ary additions
using symbolic evaluation:
A1_0 + A2_0 + A2_1, or
A1 + A2_0 + A2_1 or
A1_0 + A2
This patch is stress-tested with SingleSrc and MultiSource by
treating all fadd/fsub
instructions as being in relaxed mode.
Thank you for code review!
Shuxin
-------------- next part --------------
Index: test/Transforms/InstCombine/fast-math.ll
===================================================================
--- test/Transforms/InstCombine/fast-math.ll (revision 169752)
+++ test/Transforms/InstCombine/fast-math.ll (working copy)
@@ -3,19 +3,17 @@
; testing-case "float fold(float a) { return 1.2f * a * 2.3f; }"
; 1.2f and 2.3f is supposed to be fold.
define float @fold(float %a) {
-fold:
%mul = fmul fast float %a, 0x3FF3333340000000
%mul1 = fmul fast float %mul, 0x4002666660000000
ret float %mul1
-; CHECK: fold
+; CHECK: @fold
; CHECK: fmul float %a, 0x4006147AE0000000
}
; Same testing-case as the one used in fold() except that the operators have
; fixed FP mode.
define float @notfold(float %a) {
-notfold:
-; CHECK: notfold
+; CHECK: @notfold
; CHECK: %mul = fmul fast float %a, 0x3FF3333340000000
%mul = fmul fast float %a, 0x3FF3333340000000
%mul1 = fmul float %mul, 0x4002666660000000
@@ -23,10 +21,96 @@
}
define float @fold2(float %a) {
-fold2:
-; CHECK: fold2
+; CHECK: @fold2
; CHECK: fmul float %a, 0x4006147AE0000000
%mul = fmul float %a, 0x3FF3333340000000
%mul1 = fmul fast float %mul, 0x4002666660000000
ret float %mul1
}
+
+; C * f1 + f1 = (C+1) * f1
+define double @fold3(double %f1) {
+ %t1 = fmul fast double 2.000000e+00, %f1
+ %t2 = fadd fast double %f1, %t1
+ ret double %t2
+; CHECK: @fold3
+; CHECK: fmul fast double %f1, 3.000000e+00
+}
+
+; (C1 - X) + (C2 - Y) => ((C1+C2) - X) - Y
+define float @fold4(float %f1, float %f2) nounwind uwtable readnone ssp {
+ %sub = fsub float 4.000000e+00, %f1
+ %sub1 = fsub float 5.000000e+00, %f2
+ %add = fadd fast float %sub, %sub1
+ ret float %add
+; CHECK: @fold4
+; CHECK: fsub fast float 9.000000e+00, %f1
+}
+
+; (X + C1) + C2 => X + (C1 + C2)
+define float @fold5(float %f1, float %f2) nounwind uwtable readnone ssp {
+ %add = fadd float %f1, 4.000000e+00
+ %add1 = fadd fast float %add, 5.000000e+00
+ ret float %add1
+; CHECK: @fold5
+; CHECK: fadd float %f1, 9.000000e+00
+}
+
+; (X + X) + X => 3.0 * X
+define float @fold6(float %f1) {
+ %t1 = fadd fast float %f1, %f1
+ %t2 = fadd fast float %f1, %t1
+ ret float %t2
+; CHECK: @fold6
+; CHECK: fmul fast float %f1, 3.000000e+00
+}
+
+; C1 * X + (X + X) = (C1 + 2) * X
+define float @fold7(float %f1) {
+ %t1 = fmul fast float %f1, 5.000000e+00
+ %t2 = fadd fast float %f1, %f1
+ %t3 = fadd fast float %t1, %t2
+ ret float %t3
+; CHECK: @fold7
+; CHECK: fmul fast float %f1, 7.000000e+00
+}
+
+; (X + X) + (X + X) => 4.0 * X
+define float @fold8(float %f1) {
+ %t1 = fadd fast float %f1, %f1
+ %t2 = fadd fast float %f1, %f1
+ %t3 = fadd fast float %t1, %t2
+ ret float %t3
+; CHECK: @fold8
+; CHECK: fmul fast float %f1, 4.000000e+00
+}
+
+; X - (X + Y) => 0 - Y
+define float @fold9(float %f1, float %f2) {
+ %t1 = fadd float %f1, %f2
+ %t3 = fsub fast float %f1, %t1
+ ret float %t3
+
+; CHECK: @fold9
+; CHECK: fsub fast float 0.000000e+00, %f2
+}
+
+
+; once cause Crash/miscompilation
+define float @fail1(float %f1, float %f2) {
+ %conv3 = fadd fast float %f1, -1.000000e+00
+ %add = fadd fast float %conv3, %conv3
+ %add2 = fadd fast float %add, %conv3
+ ret float %add2
+; CHECK: @fail1
+; CHECK: ret
+}
+
+define double @fail2(double %f1, double %f2) {
+ %t1 = fsub fast double %f1, %f2
+ %t2 = fadd fast double %f1, %f2
+ %t3 = fsub fast double %t1, %t2
+ ret double %t3
+; CHECK: @fail2
+; CHECK: ret
+}
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp (revision 169752)
+++ lib/Transforms/InstCombine/InstructionCombining.cpp (working copy)
@@ -53,6 +53,7 @@
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "InstCombineFastMath.h"
#include <algorithm>
#include <climits>
using namespace llvm;
@@ -2406,6 +2407,9 @@
InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
Simplifier = &TheSimplifier;
+ FastMathInstComb FMC(Builder);
+ FastMathCombiner = &FMC;
+
bool EverMadeChange = false;
// Lower dbg.declare intrinsics otherwise their value may be clobbered
Index: lib/Transforms/InstCombine/InstCombineFastMath.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineFastMath.h (revision 0)
+++ lib/Transforms/InstCombine/InstCombineFastMath.h (revision 0)
@@ -0,0 +1,209 @@
+//===- InstCombineFastMath.h - Fast-math InstCombine definition ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef INSTCOMBINE_FASTMATH_H
+#define INSTCOMBINE_FASTMATH_H
+
+namespace llvm {
+
+class FastMathInstComb {
+public:
+ Value *simplifyFAdd(Instruction *I) { return getFAddCombiner()->simplify(I); }
+ Value *simplifyFSub(Instruction *I) { return simplifyFAdd(I);}
+
+ FastMathInstComb(InstCombiner::BuilderTy *B) : FAddComb(0), Builder(B) {}
+ ~FastMathInstComb() { delete FAddComb; }
+
+private:
+ class FAddCombine;
+
+ FAddCombine *getFAddCombiner()
+ { return FAddComb ? FAddComb : (FAddComb = new FAddCombine(Builder)); }
+
+ FAddCombine *FAddComb;
+ InstCombiner::BuilderTy *Builder;
+
+
+ //===----------------------------------------------------------------===//
+ //
+ // Helper classes starts from this point.
+ //
+ //===-----------------------------------------------------------------===//
+private:
+
+ /// Class representing coefficient of floating-point addend.
+ /// This class needs to be highly efficient.
+ class FAddendCoef {
+ public:
+ // The constructor has to initialize an APFloat, which is unnecessary for
+ // most addends which have coefficient either 1 or -1. So, the constructor
+ // is expensive. In order to avoid the cost of the constructor, we should
+ // reuse some instances whenever possible. The pre-created instances
+ // FAddCombine::Add[0-5] embodies this idea.
+ //
+ FAddendCoef() : FpVal(0.0), IntVal(0), isInt(true) {}
+
+ void set(short C) {
+ assert(!InsaneIntVal(C) && "Insane coefficient");
+ isInt = true; IntVal = C;
+ }
+ void set(const APFloat& C) { isInt = false; FpVal = C; }
+
+ bool isZero() const { return isInt ? !IntVal : FpVal.isZero();}
+
+ void negate();
+
+ // If possible, don't define operator+/operator- etc because these
+ // operators inevitably call FAddendCoef's constructor which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator-=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ bool isOne() const { return isInt && IntVal == 1; }
+ bool isTwo() const { return isInt && IntVal == 2; }
+ bool isMinusOne() const { return isInt && IntVal == -1; }
+ bool isMinusTwo() const { return isInt && IntVal == -2; }
+
+ Value *getValue(Type *) const;
+
+ private:
+ bool InsaneIntVal(int V) { return V > 4 || V < -4; }
+
+ APFloat FpVal;
+ // The integer coefficient of an individual addend is either 1 or -1,
+ // and we try to simplify at most 4 addends from neighboring at most
+ // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
+ // is overkill to this end.
+ short IntVal;
+ bool isInt;
+ };
+
+ /// FAddend is used to represent floating-point addend. An addend is
+ /// represented as <C, V>, where V is the symbolic value, and C is a
+ /// constant coefficient. A constant addend is represented as <C, 0>.
+ ///
+ class FAddend {
+ public:
+ typedef enum {
+ Simpler, // addend1 = c1*x, addend2 = c2*x, result = (c1+c2)*x
+ FlushToZero, // similar to the case of Simpler, except that (c1+c2) is a
+ // denormal, and the result is flushed to zero.
+ Zero, // addend1 = c1*x, addend2 = -c1*x , result = 0
+ Fail // addend1 = c1*x, addend2 = c2*y, and x != y
+ } SimpResult;
+
+ FAddend() { Val = 0; }
+
+ Value *getSymVal (void) const { return Val; }
+ const FAddendCoef& getCoef(void) const { return Coeff; }
+
+ bool isConstant() const { return Val == 0; }
+
+ void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+ void set(const APFloat& Coefficient, Value *V)
+ { Coeff.set(Coefficient); Val = V; }
+ void set(const ConstantFP* Coefficient, Value *V)
+ { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+
+ void negate() { Coeff.negate(); }
+
+ /// Try to simplify "\p this + \p Addend2". Iff simplification was
+ /// successful, the resulting value will be saved to "this" instance.
+ SimpResult trySimplifyAdd(const FAddend& Addend2, bool FlushToZero=false);
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+ /// Similar to FAddend::drillDownOneStep() except that the value being
+ /// splitted is the addend itself.
+ unsigned drillDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ FAddendCoef Coeff;
+ Value *Val;
+ };
+
+ /// This functor works with std::sort to permute addends such that those
+ /// having same symbolic-value are clustered together.
+ struct FAddendCmp {
+ bool operator()(const FAddend *A1, const FAddend *A2) {
+ return A1->getSymVal() < A2->getSymVal();
+ }
+ };
+
+ /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
+ /// with its neighboring at most two instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ typedef SmallVector<const FAddend*, 4> AddendVect;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ /// Convert given addend to a Value
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst);
+
+ InstCombiner::BuilderTy *Builder;
+ Instruction *Instr;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ private:
+ // "Messy" stuff to make simplifyFAdd() faster. NOTE: the functions
+ // defined this section can only be called by simplifyFAdd() itself.
+
+ // At most 4 addends are involved in simplification, so we need at
+ // most 4 - 1 tmp addends to evaluate the intermediate results.
+ #define MAX_TMP_ADDEND_NUM 3
+
+ FAddend *allocTmpAddend() {
+ assert((NextFreeIdx < MAX_TMP_ADDEND_NUM) && "run out of tmp addends");
+ return &TmpAddends[NextFreeIdx++];
+ }
+
+ void freeAllTmpAddends() { NextFreeIdx = 0; }
+
+ FAddend TmpAddends[MAX_TMP_ADDEND_NUM];
+ unsigned NextFreeIdx;
+
+ #undef MAX_TMP_ADDEND_NUM
+
+ private:
+ // Debugging stuff are clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+ };
+};
+
+} // end namespace llvm.
+
+#endif
Index: lib/Transforms/InstCombine/InstCombine.h
===================================================================
--- lib/Transforms/InstCombine/InstCombine.h (revision 169752)
+++ lib/Transforms/InstCombine/InstCombine.h (working copy)
@@ -67,6 +67,8 @@
Worklist.Add(I);
}
};
+
+class FastMathInstComb;
/// InstCombiner - The -instcombine pass.
class LLVM_LIBRARY_VISIBILITY InstCombiner
@@ -76,6 +78,8 @@
TargetLibraryInfo *TLI;
bool MadeIRChange;
LibCallSimplifier *Simplifier;
+ FastMathInstComb *FastMathCombiner;
+
public:
/// Worklist - All of the instructions that need to be simplified.
InstCombineWorklist Worklist;
Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp (revision 169752)
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp (working copy)
@@ -16,13 +16,516 @@
#include "llvm/DataLayout.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
+#include "InstCombineFastMath.h"
using namespace llvm;
using namespace PatternMatch;
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// FastMathInstComb::{FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+
+void FastMathInstComb::FAddendCoef::operator=(const FAddendCoef& That) {
+ if ((isInt = That.isInt))
+ IntVal = That.IntVal;
+ else
+ FpVal = That.FpVal;
+}
+
+void FastMathInstComb::FAddendCoef::operator+=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt == That.isInt) {
+ if (isInt)
+ IntVal += That.IntVal;
+ else
+ FpVal.add(That.FpVal, RndMode);
+ return;
+ }
+
+ if (isInt) {
+ FpVal = That.FpVal;
+ FpVal.add(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+ isInt = false;
+ return;
+ }
+
+ FpVal.add(APFloat(FpVal.getSemantics(), That.IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator-=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt == That.isInt) {
+ if (isInt)
+ IntVal -= That.IntVal;
+ else
+ FpVal.subtract(That.FpVal, RndMode);
+ return;
+ }
+
+ if (isInt) {
+ FpVal = That.FpVal;
+ FpVal.subtract(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+ isInt = false;
+ return;
+ }
+
+ FpVal.subtract(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt && That.isInt) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!InsaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt ? That.FpVal.getSemantics() : FpVal.getSemantics();
+
+ APFloat &F0 = FpVal;
+ if (isInt)
+ F0 = APFloat(Semantic, IntVal);
+
+ if (That.isInt)
+ F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.FpVal, APFloat::rmNearestTiesToEven);
+
+ return;
+}
+
+void FastMathInstComb::FAddendCoef::negate() {
+ if (isInt)
+ IntVal = 0 - IntVal;
+ else
+ FpVal.changeSign();
+}
+
+Value *FastMathInstComb::FAddendCoef::getValue(Type *Ty) const {
+ return isInt ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), FpVal);
+}
+
+FastMathInstComb::FAddend::SimpResult
+FastMathInstComb::FAddend::trySimplifyAdd
+ (const FAddend& Addend2, bool FlushToZero) {
+ // Currently flush-to-0 is ignored. The following statement is to suppress
+ // a compiler warning.
+ FlushToZero = !FlushToZero;
+
+ if (Val != Addend2.Val)
+ return Fail;
+
+ Coeff += Addend2.Coeff;
+
+ return Coeff.isZero() ? Zero : Simpler;
+}
+
+// The definition of <Val> Addends
+// =========================================
+// A + B <1, A>, <1,B>
+// A - B <1, A>, <1,B>
+// 0 - B <-1, B>
+// C * A, <C, A>
+// A + C <1, A> <C, NULL>
+// 0 +/- 0 <0, NULL> (corner case)
+//
+// Legend: A, B are not constant, C is constant
+//
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = 0;
+ if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = 0;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = 0;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, 0);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, 0);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(0.0f), 0);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
+Value *FastMathInstComb::FAddCombine::simplify(Instruction *I) {
+
+ assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+ // Currently we are unable to handle vector types.
+ if (I->getType()->isVectorTy())
+ return 0;
+
+ if (I->getOpcode() != Instruction::FAdd &&
+ I->getOpcode() != Instruction::FSub)
+ return 0;
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ unsigned OpndNum = FAddend::drillDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+ // The input instruction is: "I = 0.0 +/- V". If "V" could be
+ // split into two addends, say "V = X - Y", the instruction would have
+ // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef& CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : 0;
+ }
+
+ // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ return 0;
+}
+
+
+Value *FastMathInstComb::FAddCombine::simplifyFAdd
+ (AddendVect& Addends, unsigned InstrQuota) {
+
+ // Permute the input addends such that addends sharing same symbolic-value
+ // are clustered together. e.g. { c1*x, c2*y, c3*x, c4*y, ... } =>
+ /// { c1*x, c3*x, c2*y, c4*y, ...}.
+ std::sort(Addends.begin(), Addends.end(), FAddendCmp());
+
+ freeAllTmpAddends();
+
+ // Walk forward along the sorted addends, trying to combine adjacent two
+ // addends into a single one.
+ AddendVect SimpVect;
+ for (AddendVect::iterator I = Addends.begin(), E = Addends.end();
+ I != E; I++) {
+ const FAddend* Opnd = *I;
+ if (SimpVect.empty()) {
+ SimpVect.push_back(Opnd);
+ continue;
+ }
+
+ // Try to combine the current addend with the previous adjacent addend
+ const FAddend *Opnd0 = SimpVect.back();
+ if (Opnd0->getSymVal() != Opnd->getSymVal()) {
+ // case 1: Opnd0 + Opnd can not be simplified.
+ SimpVect.push_back(Opnd);
+ continue;
+ }
+
+ SimpVect.pop_back();
+ FAddend *T = allocTmpAddend();
+ *T = *Opnd0;
+
+ FAddend::SimpResult R = T->trySimplifyAdd(*Opnd);
+
+ // case 2: Opnd0 + Opnd = 0
+ if (R == FAddend::Zero || R == FAddend::FlushToZero)
+ continue;
+
+ // case 3: Opnd0 + Opnd = C * X
+ assert (R == FAddend::Simpler);
+ SimpVect.push_back(T);
+ }
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
+
+Value *FastMathInstComb::FAddCombine::createNaryFAdd
+ (const AddendVect& Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instruction needed exceeds the quota.
+ //
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return 0;
+
+ initCreateInstNum();
+
+ // step 2: Emit the N-ary addition.
+ // Note that at most three instructions are involved in Fadd-InstCombine: the
+ // addition in question, and at most two neighboring instructions.
+ // The resulting optimized addition should have at least one less instruction
+ // than the original addition expression tree. This implies the resulting
+ // N-ary addition has at most two instructions, and we don't need to worry
+ // about tree-height when constructing the N-ary addition.
+
+ Value *LastVal = 0;
+ bool LastValNeedNeg = false;
+
+ // Iterate the addends, creating fadd/fsub using adjacent two addends.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ bool NeedNeg;
+ Value *V = createAddendVal(**I, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+ #ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+ #endif
+
+ return LastVal;
+}
+
+Value *FastMathInstComb::FAddCombine::createFSub
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFNeg(Value *V) {
+ Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+ return createFSub(Zero, V);
+}
+
+Value *FastMathInstComb::FAddCombine::createFAdd
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+ createInstPostProc(cast<Instruction>(V));
+ return V;
+}
+
+void FastMathInstComb::FAddCombine::createInstPostProc(Instruction *NewInstr) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+ // keep track of the number of instruction created.
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instruction needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FastMathInstComb::FAddCombine::calcInstrNumber
+ (const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form the "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ const FAddend *Opnd = *I;
+ if (Opnd->isConstant())
+ continue;
+
+ const FAddendCoef& CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ if (NegOpndNum == OpndNum)
+ InstrNeeded++;
+ return InstrNeeded;
+}
+
+// Input Addend Value NeedNeg(output)
+// ================================================================
+// Constant C C false
+// <+/-1, V> V coefficient is -1
+// <2/-2, V> "fadd V, V" coefficient is -2
+// <C, V> "fmul V, C" false
+//
+Value *FastMathInstComb::FAddCombine::createAddendVal
+ (const FAddend &Opnd, bool& NeedNeg) {
+ const FAddendCoef& Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
/// AddOne - Add one to a ConstantInt.
static Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
+
/// SubOne - Subtract one from a ConstantInt.
static Constant *SubOne(ConstantInt *C) {
return ConstantInt::get(C->getContext(), C->getValue()-1);
@@ -417,6 +920,10 @@
}
}
+ if (I.hasUnsafeAlgebra())
+ if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+ return ReplaceInstUsesWith(I, V);
+
return Changed ? &I : 0;
}
@@ -657,5 +1164,8 @@
if (Value *V = dyn_castFNegVal(Op1))
return BinaryOperator::CreateFAdd(Op0, V);
+ if (I.hasUnsafeAlgebra())
+ if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+ return ReplaceInstUsesWith(I, V);
return 0;
}
More information about the llvm-commits
mailing list