[llvm-commits] [PATCH][FastMath, InstCombine] Fadd/Fsub optimizations

Wed Dec 12 14:20:41 PST 2012

Hi, Eli:

   Per our face-to-face talk yesterday, I measured some data today. Here
are the results.
The input *.bc is obtained from 
spec2kfp/benchspec/CFP2000/188.ammp/src/rectmm.c
I compile it with "clang -O0 -emit-llvm rectmm.c -c -o a.bc".

   Two <opt> builds are compared side by side:
  opt1:  some Addend instances are pre-created; corresponds to is.patch,
debug build (as an indicator)
  opt2:  no pre-created Addend instances; corresponds to was.patch,
debug build (as an indicator)

The difference in complexity in terms of LOC (not including test cases):
    is.patch = 770 vs. was.patch = 799

  For the InstCombine compile time, see below.

  Apparently, opt1 is slightly faster, albeit at the cost of 29 lines of
additional complexity.
Please advise what I should do next.  I personally think it is very
difficult to significantly reduce the is.patch.
If you believe the *.patch has lots of room for improvement in terms of
"complexity", could you please
pinpoint the code so that I can focus on it?

  IMHO, pattern matching would only make things even worse.

Thanks
Shuxin


Following is the side-by-side comparison.
===========================

   All fadd/fsub are considered unsafe

   I measured the compile time 3 times on my iMac w/ Core i7.
   The commands for the measurement are:
  ----------------------------------
   "for i in 1 2 3; do opt1 -time-passes a.bc | 2>&1 | grep "Combine"  > 
A$i.txt ; done"
   (the results for opt2 are saved to B{1,2,3}.txt)

cat A1.txt
   0.0230 (  2.1%)   0.0008 (  2.5%)   0.0237 (  2.1%)   0.0237 ( 2.1%)  
Combine redundant instructions
    0.0229 (  2.1%)   0.0000 (  0.0%)   0.0229 (  2.1%)   0.0229 ( 
2.1%)  Combine redundant instructions
    0.0119 (  1.1%)   0.0005 (  1.6%)   0.0124 (  1.1%)   0.0124 ( 
1.1%)  Combine redundant instructions
    0.0114 (  1.1%)   0.0004 (  1.2%)   0.0117 (  1.1%)   0.0117 ( 
1.1%)  Combine redundant instructions
    0.0117 (  1.1%)   0.0000 (  0.0%)   0.0117 (  1.1%)   0.0117 ( 
1.1%)  Combine redundant instructions

cat B1.txt:
    0.0262 (  2.3%)   0.0000 (  0.2%)   0.0262 (  2.3%)   0.0262 ( 
2.3%)  Combine redundant instructions
    0.0240 (  2.1%)   0.0000 (  0.1%)   0.0240 (  2.1%)   0.0240 ( 
2.1%)  Combine redundant instructions
    0.0130 (  1.2%)   0.0001 (  3.0%)   0.0131 (  1.2%)   0.0131 ( 
1.2%)  Combine redundant instructions
    0.0124 (  1.1%)   0.0000 (  0.3%)   0.0124 (  1.1%)   0.0124 ( 
1.1%)  Combine redundant instructions
    0.0120 (  1.1%)   0.0000 (  0.1%)   0.0120 (  1.1%)   0.0120 ( 
1.1%)  Combine redundant instructions

  cat A2.txt:
   0.0230 (  2.1%)   0.0008 (  2.0%)   0.0238 (  2.1%)   0.0238 ( 2.1%)  
Combine redundant instructions
    0.0224 (  2.1%)   0.0007 (  1.9%)   0.0232 (  2.1%)   0.0232 ( 
2.1%)  Combine redundant instructions
    0.0115 (  1.1%)   0.0005 (  1.3%)   0.0120 (  1.1%)   0.0120 ( 
1.1%)  Combine redundant instructions
    0.0113 (  1.0%)   0.0004 (  1.0%)   0.0116 (  1.0%)   0.0116 ( 
1.0%)  Combine redundant instructions
    0.0113 (  1.0%)   0.0004 (  1.0%)   0.0116 (  1.0%)   0.0116 ( 
1.0%)  Combine redundant instructions

   cat B2.txt:
    0.0248 (  2.3%)   0.0000 (  0.2%)   0.0248 (  2.3%)   0.0248 ( 
2.3%)  Combine redundant instructions
    0.0241 (  2.2%)   0.0000 (  0.2%)   0.0241 (  2.2%)   0.0241 ( 
2.2%)  Combine redundant instructions
    0.0121 (  1.1%)   0.0001 (  6.9%)   0.0122 (  1.1%)   0.0122 ( 
1.1%)  Combine redundant instructions
    0.0121 (  1.1%)   0.0000 (  0.6%)   0.0121 (  1.1%)   0.0121 ( 
1.1%)  Combine redundant instructions
    0.0120 (  1.1%)   0.0000 (  0.1%)   0.0120 (  1.1%)   0.0120 ( 
1.1%)  Combine redundant instructions


-------------- next part --------------
Index: InstCombineFastMath.h
===================================================================

--- InstCombineFastMath.h	(revision 0)
+++ InstCombineFastMath.h	(revision 0)
@@ -0,0 +1,209 @@
+//===- InstCombineFastMath.h - Fast-math InstCombine definition  ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef INSTCOMBINE_FASTMATH_H
+#define INSTCOMBINE_FASTMATH_H
+
+namespace llvm {
+
+class FastMathInstComb {  // Front end for the fast-math fadd/fsub combiner.
+public:
+  Value *simplifyFAdd(Instruction *I) { return getFAddCombiner()->simplify(I); }
+  Value *simplifyFSub(Instruction *I) { return simplifyFAdd(I);}
+  // NOTE(review): FAddComb is owned (deleted below) but copying is not disabled.
+  FastMathInstComb(InstCombiner::BuilderTy *B) : FAddComb(0), Builder(B) {}
+  ~FastMathInstComb() { delete FAddComb; }
+
+private:
+  class FAddCombine;
+  // Creates the FAddCombine helper on first use and caches it in FAddComb.
+  FAddCombine *getFAddCombiner()
+    { return FAddComb ? FAddComb : (FAddComb = new FAddCombine(Builder)); }
+
+  FAddCombine *FAddComb;
+  InstCombiner::BuilderTy *Builder;
+
+
+  //===----------------------------------------------------------------===//
+  //
+  //    Helper classes start from this point.
+  //
+  //===-----------------------------------------------------------------===//
+private:
+
+  /// Class representing the coefficient of a floating-point addend.
+  /// This class needs to be highly efficient.
+  class FAddendCoef {
+  public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1, so construction is
+    // expensive. To avoid that cost, instances should be reused whenever
+    // possible; the pre-created FAddend members of FAddCombine (Opnd0,
+    // Opnd1, Opnd0_0, ... and TmpAddends) embody this idea.
+    //
+    FAddendCoef() : FpVal(0.0), IntVal(0), isInt(true) {}
+
+    void set(short C) {
+      assert(!InsaneIntVal(C) && "Insane coefficient");
+      isInt = true; IntVal = C;
+    }
+    void set(const APFloat& C) { isInt = false; FpVal = C; }
+
+    bool isZero() const { return isInt ? !IntVal : FpVal.isZero();}
+
+    void negate();
+
+    // If possible, don't define operator+/operator- etc because these
+    // operators inevitably call FAddendCoef's constructor which is not cheap.
+    void operator=(const FAddendCoef &A);
+    void operator+=(const FAddendCoef &A);
+    void operator-=(const FAddendCoef &A);
+    void operator*=(const FAddendCoef &S);
+    // The predicates below are true only for the integer form of the value.
+    bool isOne() const { return isInt && IntVal == 1; }
+    bool isTwo() const { return isInt && IntVal == 2; }
+    bool isMinusOne() const { return isInt && IntVal == -1; }
+    bool isMinusTwo() const { return isInt && IntVal == -2; }
+
+    Value *getValue(Type *) const;
+
+  private:
+    bool InsaneIntVal(int V) { return V > 4 || V < -4; }
+
+    APFloat FpVal;
+    // The integer coefficient of an individual addend is either 1 or -1,
+    // and at most 4 addends, taken from at most two neighboring
+    // instructions, are simplified together. So <IntVal> stays within
+    // [-4, 4], and APInt would be overkill for this purpose.
+    short IntVal;
+    bool isInt;
+  };
+
+  /// FAddend is used to represent a floating-point addend. An addend is
+  /// represented as <C, V>, where V is a symbolic value and C is a
+  /// constant coefficient. A constant addend is represented as <C, 0>.
+  ///
+  class FAddend {
+  public:
+    typedef enum {
+      Simpler,     // addend1 = c1*x, addend2 = c2*x, result = (c1+c2)*x
+      FlushToZero, // similar to the case of Simpler, except that (c1+c2) is a
+                   // denormal, and the result is flushed to zero.
+      Zero,        // addend1 = c1*x, addend2 = -c1*x , result = 0
+      Fail         // addend1 = c1*x, addend2 = c2*y, and x != y
+    } SimpResult;
+
+    FAddend() { Val = 0; }
+
+    Value *getSymVal (void) const { return Val; }
+    const FAddendCoef& getCoef(void) const { return Coeff; }
+
+    bool isConstant() const { return Val == 0; }
+    // Setters: passing a null V makes this a constant addend.
+    void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+    void set(const APFloat& Coefficient, Value *V)
+      { Coeff.set(Coefficient); Val = V; }
+    void set(const ConstantFP* Coefficient, Value *V)
+      { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+
+    void negate() { Coeff.negate(); }
+
+    /// Try to simplify "\p this + \p Addend2". Iff simplification was
+    /// successful, the resulting value will be saved to "this" instance.
+    SimpResult trySimplifyAdd(const FAddend& Addend2, bool FlushToZero=false);
+
+    /// Drill down the U-D chain one step to find the definition of V, and
+    /// try to break the definition into one or two addends.
+    static unsigned drillDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+    /// Similar to the static FAddend::drillDownOneStep() except that the
+    /// value being split is the addend itself.
+    unsigned drillDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+  private:
+    void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+    // This addend has the value of "Coeff * Val".
+    FAddendCoef Coeff;
+    Value *Val;
+  };
+
+  /// Functor for std::sort that orders addends by the address of their
+  /// symbolic value, so addends sharing a symbolic value become adjacent.
+  struct FAddendCmp {
+    bool operator()(const FAddend *A1, const FAddend *A2) {
+      return A1->getSymVal() < A2->getSymVal();
+    }
+  };
+
+  /// FAddCombine is the class for optimizing an unsafe fadd/fsub together
+  /// with at most two of its neighboring instructions.
+  ///
+  class FAddCombine {
+  public:
+    FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+    Value *simplify(Instruction *FAdd);
+
+  private:
+    typedef SmallVector<const FAddend*, 4> AddendVect;
+
+    Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+    /// Convert given addend to a Value
+    Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+    
+    /// Return the number of instructions needed to emit the N-ary addition.
+    unsigned calcInstrNumber(const AddendVect& Vect);
+    Value *createFSub(Value *Opnd0, Value *Opnd1);
+    Value *createFAdd(Value *Opnd0, Value *Opnd1);
+    Value *createFMul(Value *Opnd0, Value *Opnd1);
+    Value *createFNeg(Value *V);
+    Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+    void createInstPostProc(Instruction *NewInst);
+
+    InstCombiner::BuilderTy *Builder;
+    Instruction *Instr;
+    // Pre-created addends, kept as members so they are constructed only once.
+    FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+  private:
+    // "Messy" stuff to make simplifyFAdd() faster. NOTE: the functions
+    // defined in this section can only be called by simplifyFAdd() itself.
+
+    // At most 4 addends are involved in simplification, so we need at
+    // most 4 - 1 tmp addends to evaluate the intermediate results.
+    #define MAX_TMP_ADDEND_NUM 3
+
+    FAddend *allocTmpAddend() {
+      assert((NextFreeIdx < MAX_TMP_ADDEND_NUM) && "run out of tmp addends");
+      return &TmpAddends[NextFreeIdx++];
+    }
+
+    void freeAllTmpAddends() { NextFreeIdx = 0; }
+
+    FAddend TmpAddends[MAX_TMP_ADDEND_NUM];
+    unsigned NextFreeIdx; // Not set by the ctor; reset via freeAllTmpAddends().
+
+    #undef MAX_TMP_ADDEND_NUM
+
+  private:
+     // Debugging helpers are clustered here.
+    #ifndef NDEBUG
+      unsigned CreateInstrNum;
+      void initCreateInstNum() { CreateInstrNum = 0; }
+      void incCreateInstNum() { CreateInstrNum++; }
+    #else
+      void initCreateInstNum() {}
+      void incCreateInstNum() {}
+    #endif
+  };
+};
+
+} // end namespace llvm.
+
+#endif
Index: InstructionCombining.cpp
===================================================================
--- InstructionCombining.cpp	(revision 169997)
+++ InstructionCombining.cpp	(working copy)
@@ -53,6 +53,7 @@
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "InstCombineFastMath.h"
 #include <algorithm>
 #include <climits>
 using namespace llvm;
@@ -2406,6 +2407,9 @@
   InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
   Simplifier = &TheSimplifier;
 
+  FastMathInstComb FMC(Builder);
+  FastMathCombiner = &FMC;
+
   bool EverMadeChange = false;
 
   // Lower dbg.declare intrinsics otherwise their value may be clobbered
Index: InstCombine.h
===================================================================
--- InstCombine.h	(revision 169997)
+++ InstCombine.h	(working copy)
@@ -67,6 +67,8 @@
     Worklist.Add(I);
   }
 };
+
+class FastMathInstComb;
   
 /// InstCombiner - The -instcombine pass.
 class LLVM_LIBRARY_VISIBILITY InstCombiner
@@ -76,6 +78,8 @@
   TargetLibraryInfo *TLI;
   bool MadeIRChange;
   LibCallSimplifier *Simplifier;
+  FastMathInstComb *FastMathCombiner;
+
 public:
   /// Worklist - All of the instructions that need to be simplified.
   InstCombineWorklist Worklist;
Index: InstCombineAddSub.cpp
===================================================================
--- InstCombineAddSub.cpp	(revision 169997)
+++ InstCombineAddSub.cpp	(working copy)
@@ -16,13 +16,516 @@
 #include "llvm/DataLayout.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/PatternMatch.h"
+#include "InstCombineFastMath.h"
 using namespace llvm;
 using namespace PatternMatch;
 
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+//    FastMathInstComb::{FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+
+void FastMathInstComb::FAddendCoef::operator=(const FAddendCoef& That) {
+  if ((isInt = That.isInt))  // copies the tag; the assignment is intended
+    IntVal = That.IntVal;    // integer form: FpVal is left untouched
+  else
+    FpVal = That.FpVal;      // FP form: IntVal is left untouched
+}
+
+void FastMathInstComb::FAddendCoef::operator+=(const FAddendCoef &That) {
+  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+  if (isInt == That.isInt) {
+    if (isInt)
+      IntVal += That.IntVal;
+    else
+      FpVal.add(That.FpVal, RndMode);
+    return;
+  }
+  // Mixed forms; result is FP. TODO confirm APFloat(sem, int) accepts negatives.
+  if (isInt) {
+    FpVal = That.FpVal;
+    FpVal.add(APFloat(FpVal.getSemantics(), IntVal), RndMode);
+    isInt = false;
+    return;
+  }
+  // This is FP and That is an integer: promote That.IntVal and add it.
+  FpVal.add(APFloat(FpVal.getSemantics(), That.IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator-=(const FAddendCoef &That) {
+  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+  if (isInt == That.isInt) {
+    if (isInt)
+      IntVal -= That.IntVal;
+    else
+      FpVal.subtract(That.FpVal, RndMode);
+    return;
+  }
+  // Mixed forms: the result is kept in FP form. Order matters: this - That.
+  if (isInt) {
+    FpVal = APFloat(That.FpVal.getSemantics(), IntVal);
+    FpVal.subtract(That.FpVal, RndMode);
+    isInt = false;
+    return;
+  }
+  // This is FP and That is an integer: subtract That's value, not our own.
+  FpVal.subtract(APFloat(FpVal.getSemantics(), That.IntVal), RndMode);
+}
+
+void FastMathInstComb::FAddendCoef::operator*=(const FAddendCoef &That) {
+  if (That.isOne())
+    return;
+
+  if (That.isMinusOne()) {
+    negate();
+    return;
+  }
+  // Both integers: stay in the cheap integer form.
+  if (isInt && That.isInt) {
+    int Res = IntVal * (int)That.IntVal;
+    assert(!InsaneIntVal(Res) && "Insane int value");
+    IntVal = Res;
+    return;
+  }
+  // At least one side is FP; use that side's semantics for the product.
+  const fltSemantics &Semantic =
+    isInt ? That.FpVal.getSemantics() : FpVal.getSemantics();
+
+  APFloat &F0 = FpVal;
+  if (isInt)
+    F0 = APFloat(Semantic, IntVal);
+
+  if (That.isInt)
+    F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven);
+  else
+    F0.multiply(That.FpVal, APFloat::rmNearestTiesToEven);
+  // The product is held in FpVal, so the coefficient must become FP.
+  isInt = false;
+}
+
+void FastMathInstComb::FAddendCoef::negate() {
+  if (isInt)
+    IntVal = 0 - IntVal;   // integer form: flip the sign of the small int
+  else
+    FpVal.changeSign();    // FP form: flip the sign of the APFloat in place
+}
+
+Value *FastMathInstComb::FAddendCoef::getValue(Type *Ty) const {
+  return isInt ?
+    ConstantFP::get(Ty, float(IntVal)) :
+    ConstantFP::get(Ty->getContext(), FpVal); // NOTE(review): Ty itself unused
+}
+
+FastMathInstComb::FAddend::SimpResult
+FastMathInstComb::FAddend::trySimplifyAdd
+  (const FAddend& Addend2, bool FlushToZero) {
+  // Flush-to-zero is currently ignored; the cast below only silences the
+  // unused-parameter warning.
+  (void)FlushToZero;
+  // Addends over different symbolic values cannot be merged.
+  if (Val != Addend2.Val)
+    return Fail;
+  // Same symbolic value: fold Addend2's coefficient into this addend.
+  Coeff += Addend2.Coeff;
+
+  return Coeff.isZero() ? Zero : Simpler;
+}
+
+// The definition of <Val>     Addends
+// =========================================
+//  A + B                     <1, A>, <1, B>
+//  A - B                     <1, A>, <-1, B>
+//  0 - B                     <-1, B>
+//  C * A,                    <C, A>
+//  A + C                     <1, A> <C, NULL>
+//  0 +/- 0                   <0, NULL> (corner case)
+//
+// Legend: A, B are not constant, C is constant
+// Returns the number of addends written (0 if Val cannot be broken up).
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+  (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+  Instruction *I = 0;
+  if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
+    return 0;
+
+  unsigned Opcode = I->getOpcode();
+
+  if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+    ConstantFP *C0, *C1;
+    Value *Opnd0 = I->getOperand(0);
+    Value *Opnd1 = I->getOperand(1);
+    if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+      Opnd0 = 0;
+
+    if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+      Opnd1 = 0;
+    // A zero operand was cleared above and contributes no addend.
+    if (Opnd0) {
+      if (!C0)
+        Addend0.set(1, Opnd0);
+      else
+        Addend0.set(C0, 0);
+    }
+    // The second operand fills Addend1, or Addend0 if the first was zero.
+    if (Opnd1) {
+      FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+      if (!C1)
+        Addend.set(1, Opnd1);
+      else
+        Addend.set(C1, 0);
+      if (Opcode == Instruction::FSub)
+        Addend.negate();
+    }
+
+    if (Opnd0 || Opnd1)
+      return Opnd0 && Opnd1 ? 2 : 1;
+    // NOTE(review): the zero below is built from a float literal regardless
+    // of I's type; confirm this is fine for double. Both operands are zero.
+    Addend0.set(APFloat(0.0f), 0);
+    return 1;
+  }
+
+  if (I->getOpcode() == Instruction::FMul) {
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+      Addend0.set(C, V1);
+      return 1;
+    }
+
+    if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+      Addend0.set(C, V0);
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+unsigned FastMathInstComb::FAddend::drillDownOneStep
+  (FAddend &Addend0, FAddend &Addend1) const {
+  if (isConstant())
+    return 0;
+  // Break up the symbolic value, then scale each piece by our coefficient.
+  unsigned BreakNum = FAddend::drillDownOneStep(Val, Addend0, Addend1);
+  if (!BreakNum || Coeff.isOne()) 
+    return BreakNum;
+
+  Addend0.Scale(Coeff);
+
+  if (BreakNum == 2)
+    Addend1.Scale(Coeff);
+
+  return BreakNum;
+}
+
+Value *FastMathInstComb::FAddCombine::simplify(Instruction *I) {
+  // Tries to fold the unsafe fadd/fsub I; returns the replacement value or 0.
+  assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+  // Currently we are not able to handle vector types.
+  if (I->getType()->isVectorTy())
+    return 0;
+
+  if (I->getOpcode() != Instruction::FAdd && 
+      I->getOpcode() != Instruction::FSub)
+    return 0;
+
+  // Save the instruction before calling other member-functions.
+  Instr = I;
+
+  unsigned OpndNum = FAddend::drillDownOneStep(I, Opnd0, Opnd1);
+
+  // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1
+  unsigned Opnd0_ExpNum = 0;
+  unsigned Opnd1_ExpNum = 0;
+
+  if (!Opnd0.isConstant()) 
+    Opnd0_ExpNum = Opnd0.drillDownOneStep(Opnd0_0, Opnd0_1);
+
+  // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+  if (OpndNum == 2 && !Opnd1.isConstant())
+    Opnd1_ExpNum = Opnd1.drillDownOneStep(Opnd1_0, Opnd1_1);
+
+  // Step 3: try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+  if (Opnd0_ExpNum && Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0_0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    // Compute instruction quota. We should save at least one instruction.
+    unsigned InstQuota = 0;
+    // Both operands non-constant and single-use: two instructions may be used.
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&  
+                 (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+    if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+      return R;
+  }
+
+  if (OpndNum != 2) {
+    // Only one addend remains, e.g. "I = 0.0 +/- V" or "I = V +/- 0.0".
+    // If its coefficient is 1, I is simply V. NOTE(review): step 3 only runs
+    // when both operands expand, so "0.0 - (X - Y)" is not rewritten to
+    // "Y - X" by this function as written; confirm that this is intended.
+    const FAddendCoef& CE = Opnd0.getCoef();
+    return CE.isOne() ? Opnd0.getSymVal() : 0;
+  }
+
+  // Step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+  if (Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  // Step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+  if (Opnd0_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd1);
+    AllOpnds.push_back(&Opnd0_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  return 0;
+}
+
+
+Value *FastMathInstComb::FAddCombine::simplifyFAdd
+  (AddendVect& Addends, unsigned InstrQuota) {
+  // Folds addends over the same symbolic value, then emits the remaining sum.
+  // Permute the input addends such that addends sharing the same symbolic
+  // value are clustered together. e.g. { c1*x, c2*y, c3*x, c4*y, ... } =>
+  // { c1*x, c3*x, c2*y, c4*y, ...}. The order follows pointer addresses.
+  std::sort(Addends.begin(), Addends.end(), FAddendCmp());
+
+  freeAllTmpAddends();
+
+  // Walk forward along the sorted addends, trying to combine two adjacent
+  // addends into a single one.
+  AddendVect SimpVect;
+  for (AddendVect::iterator I = Addends.begin(), E = Addends.end();
+       I != E; I++) {
+    const FAddend* Opnd = *I;
+    if (SimpVect.empty()) {
+      SimpVect.push_back(Opnd);
+      continue;
+    }
+
+    // Try to combine the current addend with the previous adjacent addend
+    const FAddend *Opnd0 = SimpVect.back();
+    if (Opnd0->getSymVal() != Opnd->getSymVal()) {
+      // case 1: Opnd0 + Opnd can not be simplified.
+      SimpVect.push_back(Opnd);
+      continue;
+    }
+    // Combine into a scratch addend so that the input addends stay intact.
+    SimpVect.pop_back();
+    FAddend *T = allocTmpAddend();
+    *T = *Opnd0;
+
+    FAddend::SimpResult R = T->trySimplifyAdd(*Opnd);
+    
+    // case 2: Opnd0 + Opnd = 0
+    if (R == FAddend::Zero || R == FAddend::FlushToZero)
+      continue;
+
+    // case 3: Opnd0 + Opnd = C * X
+    assert (R == FAddend::Simpler);
+    SimpVect.push_back(T);
+  }
+
+  Value *Result;
+  if (!SimpVect.empty())
+    Result = createNaryFAdd(SimpVect, InstrQuota);
+  else {
+    // The addition is folded to 0.0
+    Result = ConstantFP::get(Instr->getType(), 0.0);
+  }
+
+  return Result;
+}
+
+Value *FastMathInstComb::FAddCombine::createNaryFAdd
+  (const AddendVect& Opnds, unsigned InstrQuota) {
+  assert(!Opnds.empty() && "Exect at least one addend");
+
+  // Step 1: Check if the # of instructions needed exceeds the quota.
+  // If it does, nothing is emitted and 0 is returned.
+  unsigned InstrNeeded = calcInstrNumber(Opnds);
+  if (InstrNeeded > InstrQuota)
+    return 0;
+
+  initCreateInstNum();
+
+  // Step 2: Emit the N-ary addition.
+  // Note that at most three instructions are involved in fadd combining: the
+  // addition in question, and at most two neighboring instructions.
+  // The resulting optimized addition should have at least one less instruction
+  // than the original addition expression tree. This implies the resulting
+  // N-ary addition has at most two instructions, and we don't need to worry
+  // about tree-height when constructing the N-ary addition.
+
+  Value *LastVal = 0;
+  bool LastValNeedNeg = false;
+
+  // Iterate the addends, creating fadd/fsub from two adjacent addends.
+  for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+       I != E; I++) {
+    bool NeedNeg; 
+    Value *V = createAddendVal(**I, NeedNeg);
+    if (!LastVal) {
+      LastVal = V;
+      LastValNeedNeg = NeedNeg;
+      continue;
+    }
+    // Same sign: add them; the shared pending negation (if any) is kept.
+    if (LastValNeedNeg == NeedNeg) {
+      LastVal = createFAdd(LastVal, V);
+      continue;
+    }
+    // Opposite signs: subtract the negated one from the other.
+    if (LastValNeedNeg)
+      LastVal = createFSub(V, LastVal);
+    else
+      LastVal = createFSub(LastVal, V);
+
+    LastValNeedNeg = false;
+  }
+  // Everything accumulated is still pending negation: emit it explicitly.
+  if (LastValNeedNeg) {
+    LastVal = createFNeg(LastVal);
+  }
+
+  #ifndef NDEBUG
+    assert(CreateInstrNum == InstrNeeded && 
+           "Inconsistent in instruction numbers");
+  #endif
+
+  return LastVal;
+}
+
+Value *FastMathInstComb::FAddCombine::createFSub
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+  createInstPostProc(cast<Instruction>(V)); // NOTE(review): assumes no folding
+  return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFNeg(Value *V) {
+  Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+  return createFSub(Zero, V); // negation is emitted as "0.0 - V"
+}
+
+Value *FastMathInstComb::FAddCombine::createFAdd
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+  createInstPostProc(cast<Instruction>(V)); // NOTE(review): assumes no folding
+  return V;
+}
+
+Value *FastMathInstComb::FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+  createInstPostProc(cast<Instruction>(V)); // NOTE(review): assumes no folding
+  return V;
+}
+
+void FastMathInstComb::FAddCombine::createInstPostProc(Instruction *NewInstr) {
+  NewInstr->setDebugLoc(Instr->getDebugLoc());
+  // Instr is the instruction being simplified; it is set by simplify().
+  // Keep track of the number of instructions created.
+  incCreateInstNum();
+
+  // Propagate fast-math flags
+  NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instructions needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FastMathInstComb::FAddCombine::calcInstrNumber
+  (const AddendVect &Opnds) {
+  unsigned OpndNum = Opnds.size();
+  unsigned InstrNeeded = OpndNum - 1;
+
+  // The number of non-constant addends whose coefficient is -1 or -2.
+  unsigned NegOpndNum = 0; 
+
+  // Adjust the number of instructions needed to emit the N-ary add.
+  for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+       I != E; I++) {
+    const FAddend *Opnd = *I;
+    if (Opnd->isConstant())
+      continue;
+
+    const FAddendCoef& CE = Opnd->getCoef();
+    if (CE.isMinusOne() || CE.isMinusTwo())
+      NegOpndNum++;
+
+    // Let the addend be "c * x". If "c == +/-1", the value of the addend
+    // is immediately available; otherwise, it needs exactly one instruction
+    // to evaluate the value.
+    if (!CE.isMinusOne() && !CE.isOne())
+      InstrNeeded++;
+  }
+  if (NegOpndNum == OpndNum)
+    InstrNeeded++;  // every addend is negated: one final negation is needed
+  return InstrNeeded;
+}
+
+// Input Addend        Value           NeedNeg(output)
+// ================================================================
+// Constant C          C               false
+// <+/-1, V>           V               true iff coefficient is -1
+// <2/-2, V>          "fadd V, V"      true iff coefficient is -2
+// <C, V>             "fmul V, C"      false
+//
+Value *FastMathInstComb::FAddCombine::createAddendVal
+  (const FAddend &Opnd, bool& NeedNeg) {
+  const FAddendCoef& Coeff = Opnd.getCoef();
+  // A constant addend is just its coefficient; no instruction is created.
+  if (Opnd.isConstant()) {
+    NeedNeg = false;
+    return Coeff.getValue(Instr->getType());
+  }
+
+  Value *OpndVal = Opnd.getSymVal();
+  // +/-1: the symbolic value itself; the caller applies the sign.
+  if (Coeff.isMinusOne() || Coeff.isOne()) {
+    NeedNeg = Coeff.isMinusOne();
+    return OpndVal;
+  }
+  // +/-2: emitted as V + V; the caller applies the sign.
+  if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+    NeedNeg = Coeff.isMinusTwo();
+    return createFAdd(OpndVal, OpndVal);
+  }
+  // Any other coefficient: one multiply with the sign kept in the constant.
+  NeedNeg = false;
+  return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
 /// AddOne - Add one to a ConstantInt.
 static Constant *AddOne(Constant *C) {
   return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
 }
+
 /// SubOne - Subtract one from a ConstantInt.
 static Constant *SubOne(ConstantInt *C) {
   return ConstantInt::get(C->getContext(), C->getValue()-1);
@@ -402,6 +905,10 @@
     }
   }
   
+  if (I.hasUnsafeAlgebra())
+    if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+      return ReplaceInstUsesWith(I, V);
+
   return Changed ? &I : 0;
 }
 
@@ -645,5 +1152,8 @@
   if (Value *V = dyn_castFNegVal(Op1))
     return BinaryOperator::CreateFAdd(Op0, V);
 
+  if (I.hasUnsafeAlgebra())
+    if (Value *V = FastMathCombiner->simplifyFAdd(&I))
+      return ReplaceInstUsesWith(I, V);
   return 0;
 }
-------------- next part --------------
Index: InstCombineFastMath.h
===================================================================
--- InstCombineFastMath.h	(revision 0)
+++ InstCombineFastMath.h	(revision 0)
@@ -0,0 +1,183 @@
+//===- InstCombineFastMath.h - Fast-math InstCombine definition  ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef INSTCOMBINE_FASTMATH_H
+#define INSTCOMBINE_FASTMATH_H
+
+namespace llvm {
+
+  /// Class representing the coefficient of a floating-point addend.
+  /// This class needs to be highly efficient.
+  class FAddendCoef {
+  public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1, so construction is
+    // expensive. To avoid that cost, instances should be reused whenever
+    // possible; the pre-created FAddend members of FAddCombine (Opnd0,
+    // Opnd1, Opnd0_0, ... and TmpAddends) embody this idea.
+    //
+    FAddendCoef() : FpVal(0.0), IntVal(0), isInt(true) {}
+
+    void set(short C) {
+      assert(!InsaneIntVal(C) && "Insane coefficient");
+      isInt = true; IntVal = C;
+    }
+    void set(const APFloat& C) { isInt = false; FpVal = C; }
+
+    bool isZero() const { return isInt ? !IntVal : FpVal.isZero();}
+
+    void negate();
+
+    // If possible, don't define operator+/operator- etc because these
+    // operators inevitably call FAddendCoef's constructor which is not cheap.
+    void operator=(const FAddendCoef &A);
+    void operator+=(const FAddendCoef &A);
+    void operator-=(const FAddendCoef &A);
+    void operator*=(const FAddendCoef &S);
+    // The predicates below are true only for the integer form of the value.
+    bool isOne() const { return isInt && IntVal == 1; }
+    bool isTwo() const { return isInt && IntVal == 2; }
+    bool isMinusOne() const { return isInt && IntVal == -1; }
+    bool isMinusTwo() const { return isInt && IntVal == -2; }
+
+    Value *getValue(Type *) const;
+
+  private:
+    bool InsaneIntVal(int V) { return V > 4 || V < -4; }
+
+    APFloat FpVal;
+    // The integer coefficient of an individual addend is either 1 or -1,
+    // and at most 4 addends, taken from at most two neighboring
+    // instructions, are simplified together. So <IntVal> stays within
+    // [-4, 4], and APInt would be overkill for this purpose.
+    short IntVal;
+    bool isInt;
+  };
+
+  /// FAddend is used to represent a floating-point addend. An addend is
+  /// represented as <C, V>, where V is a symbolic value and C is a
+  /// constant coefficient. A constant addend is represented as <C, 0>.
+  ///
+  class FAddend {
+  public:
+    typedef enum {
+      Simpler,     // addend1 = c1*x, addend2 = c2*x, result = (c1+c2)*x
+      FlushToZero, // similar to the case of Simpler, except that (c1+c2) is a
+                   // denormal, and the result is flushed to zero.
+      Zero,        // addend1 = c1*x, addend2 = -c1*x , result = 0
+      Fail         // addend1 = c1*x, addend2 = c2*y, and x != y
+    } SimpResult;
+
+    FAddend() { Val = 0; }
+
+    Value *getSymVal (void) const { return Val; }
+    const FAddendCoef& getCoef(void) const { return Coeff; }
+
+    bool isConstant() const { return Val == 0; }
+    // Setters: passing a null V makes this a constant addend.
+    void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+    void set(const APFloat& Coefficient, Value *V)
+      { Coeff.set(Coefficient); Val = V; }
+    void set(const ConstantFP* Coefficient, Value *V)
+      { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+
+    void negate() { Coeff.negate(); }
+
+    /// Try to simplify "\p this + \p Addend2". Iff simplification was
+    /// successful, the resulting value will be saved to "this" instance.
+    SimpResult trySimplifyAdd(const FAddend& Addend2, bool FlushToZero=false);
+
+    /// Drill down the U-D chain one step to find the definition of V, and
+    /// try to break the definition into one or two addends.
+    static unsigned drillDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+    /// Similar to the static FAddend::drillDownOneStep() except that the
+    /// value being split is the addend itself.
+    unsigned drillDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+  private:
+    void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+    // This addend has the value of "Coeff * Val".
+    FAddendCoef Coeff;
+    Value *Val;
+  };
+
+  /// Functor for std::sort that orders addends by the address of their
+  /// symbolic value, so addends sharing a symbolic value become adjacent.
+  struct FAddendCmp {
+    bool operator()(const FAddend *A1, const FAddend *A2) {
+      return A1->getSymVal() < A2->getSymVal();
+    }
+  };
+
+  /// FAddCombine is the class for optimizing an unsafe fadd/fsub together
+  /// with at most two of its neighboring instructions.
+  ///
+  class FAddCombine {
+  public:
+    FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+    Value *simplify(Instruction *FAdd);
+
+  private:
+    typedef SmallVector<const FAddend*, 4> AddendVect;
+
+    Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+    /// Convert given addend to a Value
+    Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+    
+    /// Return the number of instructions needed to emit the N-ary addition.
+    unsigned calcInstrNumber(const AddendVect& Vect);
+    Value *createFSub(Value *Opnd0, Value *Opnd1);
+    Value *createFAdd(Value *Opnd0, Value *Opnd1);
+    Value *createFMul(Value *Opnd0, Value *Opnd1);
+    Value *createFNeg(Value *V);
+    Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+    void createInstPostProc(Instruction *NewInst);
+
+    InstCombiner::BuilderTy *Builder;
+    Instruction *Instr;
+    // Pre-created addends, kept as members so they are constructed only once.
+    FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+  private:
+    // "Messy" stuff to make simplifyFAdd() faster. NOTE: the functions
+    // defined in this section can only be called by simplifyFAdd() itself.
+
+    // At most 4 addends are involved in simplification, so we need at
+    // most 4 - 1 tmp addends to evaluate the intermediate results.
+    #define MAX_TMP_ADDEND_NUM 3
+
+    FAddend *allocTmpAddend() {
+      assert((NextFreeIdx < MAX_TMP_ADDEND_NUM) && "run out of tmp addends");
+      return &TmpAddends[NextFreeIdx++];
+    }
+
+    void freeAllTmpAddends() { NextFreeIdx = 0; }
+
+    FAddend TmpAddends[MAX_TMP_ADDEND_NUM];
+    unsigned NextFreeIdx; // Not set by the ctor; reset via freeAllTmpAddends().
+
+    #undef MAX_TMP_ADDEND_NUM
+
+  private:
+     // Debugging helpers are clustered here.
+    #ifndef NDEBUG
+      unsigned CreateInstrNum;
+      void initCreateInstNum() { CreateInstrNum = 0; }
+      void incCreateInstNum() { CreateInstrNum++; }
+    #else
+      void initCreateInstNum() {}
+      void incCreateInstNum() {}
+    #endif
+  };
+
+} // end namespace llvm.
+
+#endif
Index: InstructionCombining.cpp
===================================================================
--- InstructionCombining.cpp	(revision 169996)
+++ InstructionCombining.cpp	(working copy)
@@ -53,6 +53,7 @@
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "InstCombineFastMath.h"
 #include <algorithm>
 #include <climits>
 using namespace llvm;
Index: InstCombine.h
===================================================================
--- InstCombine.h	(revision 169996)
+++ InstCombine.h	(working copy)
@@ -67,6 +67,8 @@
     Worklist.Add(I);
   }
 };
+
+class FastMathInstComb;
   
 /// InstCombiner - The -instcombine pass.
 class LLVM_LIBRARY_VISIBILITY InstCombiner
@@ -76,6 +78,8 @@
   TargetLibraryInfo *TLI;
   bool MadeIRChange;
   LibCallSimplifier *Simplifier;
+  FastMathInstComb *FastMathCombiner;
+
 public:
   /// Worklist - All of the instructions that need to be simplified.
   InstCombineWorklist Worklist;
Index: InstCombineAddSub.cpp
===================================================================
--- InstCombineAddSub.cpp	(revision 169996)
+++ InstCombineAddSub.cpp	(working copy)
@@ -16,13 +16,516 @@
 #include "llvm/DataLayout.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/PatternMatch.h"
+#include "InstCombineFastMath.h"
 using namespace llvm;
 using namespace PatternMatch;
 
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+//    {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+
+// Copy the representation tag from That, then copy only the member that the
+// tag selects (IntVal for integral coefficients, FpVal otherwise).
+void FAddendCoef::operator=(const FAddendCoef& That) {
+  isInt = That.isInt;
+  if (isInt) {
+    IntVal = That.IntVal;
+    return;
+  }
+  FpVal = That.FpVal;
+}
+
+// Add That to this coefficient. Int + Int stays integral; every other
+// combination produces a floating-point coefficient rounded to nearest-even.
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+  const APFloat::roundingMode RM = APFloat::rmNearestTiesToEven;
+
+  // Int + Int
+  if (isInt && That.isInt) {
+    IntVal += That.IntVal;
+    return;
+  }
+
+  // Fp + Fp
+  if (!isInt && !That.isInt) {
+    FpVal.add(That.FpVal, RM);
+    return;
+  }
+
+  // Fp + Int: promote That's integer to our semantics.
+  if (!isInt) {
+    FpVal.add(APFloat(FpVal.getSemantics(), That.IntVal), RM);
+    return;
+  }
+
+  // Int + Fp: start from That's value, add our integer, and switch this
+  // coefficient over to the floating-point representation.
+  FpVal = That.FpVal;
+  FpVal.add(APFloat(FpVal.getSemantics(), IntVal), RM);
+  isInt = false;
+}
+
+// Subtract That from this coefficient (this = this - That). Int - Int stays
+// integral; any mix involving a floating-point side yields a floating-point
+// coefficient rounded to nearest-even.
+void FAddendCoef::operator-=(const FAddendCoef &That) {
+  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+  if (isInt == That.isInt) {
+    if (isInt)
+      IntVal -= That.IntVal;
+    else
+      FpVal.subtract(That.FpVal, RndMode);
+    return;
+  }
+
+  if (isInt) {
+    // Int - Fp: promote our integer to That's semantics first so the operand
+    // order (this - That) is preserved; starting from That.FpVal and
+    // subtracting IntVal would compute the negated result. IntVal is read
+    // before FpVal is written.
+    APFloat Promoted(That.FpVal.getSemantics(), IntVal);
+    Promoted.subtract(That.FpVal, RndMode);
+    FpVal = Promoted;
+    isInt = false;
+    return;
+  }
+
+  // Fp - Int: the subtrahend is That's integer, not our own IntVal (which is
+  // not the active member when this coefficient is floating point).
+  FpVal.subtract(APFloat(FpVal.getSemantics(), That.IntVal), RndMode);
+}
+
+// Multiply this coefficient by That. Multiplying by +1 is a no-op and by -1
+// is a negation. Int * Int stays integral; every other combination produces a
+// floating-point coefficient rounded to nearest-even.
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+  if (That.isOne())
+    return;
+
+  if (That.isMinusOne()) {
+    negate();
+    return;
+  }
+
+  if (isInt && That.isInt) {
+    int Res = IntVal * (int)That.IntVal;
+    assert(!InsaneIntVal(Res) && "Insane int value");
+    IntVal = Res;
+    return;
+  }
+
+  // At least one side is floating point; take the semantics from that side.
+  const fltSemantics &Semantic = 
+    isInt ? That.FpVal.getSemantics() : FpVal.getSemantics();
+
+  if (isInt) {
+    // Promote our integer to floating point and record the switch of
+    // representation; without clearing isInt the product stored in FpVal
+    // would be ignored by every reader that dispatches on isInt.
+    FpVal = APFloat(Semantic, IntVal);
+    isInt = false;
+  }
+
+  if (That.isInt)
+    FpVal.multiply(APFloat(Semantic, That.IntVal),
+                   APFloat::rmNearestTiesToEven);
+  else
+    FpVal.multiply(That.FpVal, APFloat::rmNearestTiesToEven);
+}
+
+// Flip the sign of the coefficient in whichever representation is active.
+void FAddendCoef::negate() {
+  if (!isInt) {
+    FpVal.changeSign();
+    return;
+  }
+  IntVal = 0 - IntVal;
+}
+
+// Materialize the coefficient as a floating-point constant. An integral
+// coefficient is built for type Ty; a floating-point one is built from FpVal
+// in Ty's context.
+Value *FAddendCoef::getValue(Type *Ty) const {
+  if (isInt)
+    return ConstantFP::get(Ty, float(IntVal));
+  return ConstantFP::get(Ty->getContext(), FpVal);
+}
+
+// Fold Addend2 into this addend when both refer to the same symbolic value.
+// Returns Fail if the values differ, Zero if the summed coefficient is zero,
+// and Simpler otherwise.
+FAddend::SimpResult
+FAddend::trySimplifyAdd
+  (const FAddend& Addend2, bool FlushToZero) {
+  // Flush-to-zero is currently ignored; reference the parameter only to keep
+  // the compiler from warning about it.
+  (void)FlushToZero;
+
+  if (Val != Addend2.Val)
+    return Fail;
+
+  Coeff += Addend2.Coeff;
+
+  if (Coeff.isZero())
+    return Zero;
+  return Simpler;
+}
+
+// The definition of <Val>     Addends
+// =========================================
+//  A + B                     <1, A>, <1,B>
+//  A - B                     <1, A>, <-1,B>
+//  0 - B                     <-1, B>
+//  C * A,                    <C, A>
+//  A + C                     <1, A> <C, NULL> 
+//  0 +/- 0                   <0, NULL> (corner case)
+//
+// Legend: A, B are not constant, C is constant
+// 
+// Break Val into at most two addends, written to Addend0 and Addend1.
+// Returns the number of addends produced (0 if Val cannot be decomposed).
+unsigned FAddend::drillDownOneStep
+  (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+  // Only a non-null instruction can be broken into addends.
+  if (Val == 0)
+    return 0;
+  Instruction *Inst = dyn_cast<Instruction>(Val);
+  if (!Inst)
+    return 0;
+
+  const unsigned Opc = Inst->getOpcode();
+  const bool IsSub = (Opc == Instruction::FSub);
+
+  if (Opc == Instruction::FAdd || IsSub) {
+    Value *LHS = Inst->getOperand(0);
+    Value *RHS = Inst->getOperand(1);
+    ConstantFP *LHSConst = dyn_cast<ConstantFP>(LHS);
+    ConstantFP *RHSConst = dyn_cast<ConstantFP>(RHS);
+
+    // A constant-zero operand contributes no addend.
+    const bool HasLHS = !(LHSConst && LHSConst->isZero());
+    const bool HasRHS = !(RHSConst && RHSConst->isZero());
+
+    if (HasLHS) {
+      if (LHSConst)
+        Addend0.set(LHSConst, 0);
+      else
+        Addend0.set(1, LHS);
+    }
+
+    if (HasRHS) {
+      // The right operand goes into the first output slot still unused.
+      FAddend &Dest = HasLHS ? Addend1 : Addend0;
+      if (RHSConst)
+        Dest.set(RHSConst, 0);
+      else
+        Dest.set(1, RHS);
+      // For a subtraction the right operand enters with the opposite sign.
+      if (IsSub)
+        Dest.negate();
+    }
+
+    if (HasLHS && HasRHS)
+      return 2;
+    if (HasLHS || HasRHS)
+      return 1;
+
+    // Both operands are zero constants: report a single zero addend.
+    Addend0.set(APFloat(0.0f), 0);
+    return 1;
+  }
+
+  if (Opc == Instruction::FMul) {
+    Value *LHS = Inst->getOperand(0);
+    Value *RHS = Inst->getOperand(1);
+
+    // "C * X" or "X * C" becomes the single addend <C, X>.
+    if (ConstantFP *C = dyn_cast<ConstantFP>(LHS)) {
+      Addend0.set(C, RHS);
+      return 1;
+    }
+    if (ConstantFP *C = dyn_cast<ConstantFP>(RHS)) {
+      Addend0.set(C, LHS);
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Break this addend's symbolic value into at most two addends and scale each
+// of them by this addend's coefficient. Returns the number of addends
+// produced; a constant addend is never decomposed.
+unsigned FAddend::drillDownOneStep
+  (FAddend &Addend0, FAddend &Addend1) const {
+  if (isConstant())
+    return 0;
+
+  const unsigned NumParts = FAddend::drillDownOneStep(Val, Addend0, Addend1);
+
+  // Scaling is only needed when something was produced and the coefficient
+  // is not the multiplicative identity.
+  if (NumParts != 0 && !Coeff.isOne()) {
+    Addend0.Scale(Coeff);
+    if (NumParts == 2)
+      Addend1.Scale(Coeff);
+  }
+
+  return NumParts;
+}
+
+// Entry point of the fadd/fsub combiner. Decomposes I (and, one level down,
+// its addends) into <coefficient, value> pairs and asks simplifyFAdd() to
+// re-emit them within an instruction quota. Returns the replacement value,
+// or 0 when nothing was simplified.
+Value *FAddCombine::simplify(Instruction *I) {
+
+  assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+  // Currently we are not able to handle vector type.
+  if (I->getType()->isVectorTy())
+    return 0;
+
+  if (I->getOpcode() != Instruction::FAdd && 
+      I->getOpcode() != Instruction::FSub)
+    return 0;
+
+  // Save the instruction before calling other member-functions; they read
+  // Instr for the result type, debug location and fast-math flags.
+  Instr = I;
+
+  // Split I itself into Opnd0 [and Opnd1].
+  unsigned OpndNum = FAddend::drillDownOneStep(I, Opnd0, Opnd1);
+
+  // Number of addends each top-level addend expands into (0 = not expanded).
+  unsigned Opnd0_ExpNum = 0;
+  unsigned Opnd1_ExpNum = 0;
+
+  // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+  if (!Opnd0.isConstant()) 
+    Opnd0_ExpNum = Opnd0.drillDownOneStep(Opnd0_0, Opnd0_1);
+
+  // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+  if (OpndNum == 2 && !Opnd1.isConstant())
+    Opnd1_ExpNum = Opnd1.drillDownOneStep(Opnd1_0, Opnd1_1);
+
+  // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1.
+  if (Opnd0_ExpNum && Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0_0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    // Compute instruction quota. We should save at least one instruction.
+    unsigned InstQuota = 0;
+
+    // The quota is 2 only when both operands of I are non-constant values
+    // with a single use; otherwise it is 1.
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&  
+                 (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+    if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+      return R;
+  }
+
+  if (OpndNum != 2) {
+    // The input instruction is: "I = 0.0 +/- V". If "V" could have been
+    // split into two addends, say "V = X - Y", the instruction would have
+    // been optimized into "I = Y - X" in the previous steps.
+    //
+    const FAddendCoef& CE = Opnd0.getCoef();
+    return CE.isOne() ? Opnd0.getSymVal() : 0;
+  }
+
+  // Step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1].
+  if (Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  // Step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1].
+  if (Opnd0_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd1);
+    AllOpnds.push_back(&Opnd0_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  return 0;
+}
+
+
+// Combine addends that share a symbolic value, then emit the remaining ones
+// as an N-ary addition within InstrQuota instructions. Returns the resulting
+// value, a 0.0 constant if everything cancels, or 0 if the quota is exceeded.
+Value *FAddCombine::simplifyFAdd
+  (AddendVect& Addends, unsigned InstrQuota) {
+
+  // Cluster addends with the same symbolic value next to each other, e.g.
+  // { c1*x, c2*y, c3*x, c4*y, ... } => { c1*x, c3*x, c2*y, c4*y, ... }.
+  std::sort(Addends.begin(), Addends.end(), FAddendCmp());
+
+  freeAllTmpAddends();
+
+  // Sweep the sorted addends once, merging each one into its predecessor
+  // whenever both refer to the same symbolic value.
+  AddendVect Merged;
+  for (unsigned Idx = 0, N = Addends.size(); Idx != N; ++Idx) {
+    const FAddend *Cur = Addends[Idx];
+
+    // Nothing to merge with, or the previous addend has a different value.
+    if (Merged.empty() || Merged.back()->getSymVal() != Cur->getSymVal()) {
+      Merged.push_back(Cur);
+      continue;
+    }
+
+    // Replace the previous addend by a scratch copy holding the sum.
+    const FAddend *Prev = Merged.back();
+    Merged.pop_back();
+    FAddend *Sum = allocTmpAddend();
+    *Sum = *Prev;
+
+    const FAddend::SimpResult Outcome = Sum->trySimplifyAdd(*Cur);
+
+    // A sum that vanishes is dropped; otherwise it must be "C * X".
+    if (Outcome != FAddend::Zero && Outcome != FAddend::FlushToZero) {
+      assert (Outcome == FAddend::Simpler);
+      Merged.push_back(Sum);
+    }
+  }
+
+  // Every addend cancelled: the whole addition folds to 0.0.
+  if (Merged.empty())
+    return ConstantFP::get(Instr->getType(), 0.0);
+
+  return createNaryFAdd(Merged, InstrQuota);
+}
+
+// Emit the sum of Opnds as a chain of fadd/fsub instructions, provided that
+// takes no more than InstrQuota instructions. Returns the final value, or 0
+// if the quota would be exceeded.
+Value *FAddCombine::createNaryFAdd
+  (const AddendVect& Opnds, unsigned InstrQuota) {
+  assert(!Opnds.empty() && "Exect at least one addend");
+
+  // Step 1: Bail out if the emission would cost more than the quota.
+  const unsigned InstrNeeded = calcInstrNumber(Opnds);
+  if (InstrNeeded > InstrQuota)
+    return 0;
+
+  initCreateInstNum();
+
+  // Step 2: Emit the N-ary addition.
+  // At most three instructions are involved in Fadd-InstCombine: the addition
+  // in question and at most two neighboring instructions. The optimized
+  // result must have at least one instruction fewer than the original
+  // expression tree, so it has at most two instructions and tree height is
+  // not a concern here.
+
+  Value *Acc = 0;
+  bool AccNegated = false;
+
+  // Fold the addends left to right into the accumulator.
+  for (unsigned Idx = 0, N = Opnds.size(); Idx != N; ++Idx) {
+    bool TermNegated;
+    Value *Term = createAddendVal(*Opnds[Idx], TermNegated);
+
+    if (!Acc) {
+      Acc = Term;
+      AccNegated = TermNegated;
+      continue;
+    }
+
+    if (AccNegated == TermNegated) {
+      // Same pending sign on both sides: add, and keep the sign pending.
+      Acc = createFAdd(Acc, Term);
+    } else {
+      // Opposite signs: subtract the negated side from the other one, which
+      // leaves no pending negation.
+      Acc = AccNegated ? createFSub(Term, Acc) : createFSub(Acc, Term);
+      AccNegated = false;
+    }
+  }
+
+  // Apply a negation that is still pending on the final result.
+  if (AccNegated)
+    Acc = createFNeg(Acc);
+
+  #ifndef NDEBUG
+    assert(CreateInstrNum == InstrNeeded && 
+           "Inconsistent in instruction numbers");
+  #endif
+
+  return Acc;
+}
+
+// Emit "Opnd0 - Opnd1" through the builder and return the result.
+Value *FAddCombine::createFSub
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+  // The builder may hand back something other than a new instruction (for
+  // example a folded constant); only a real instruction gets the
+  // post-processing, and an unconditional cast<> would be invalid otherwise.
+  if (Instruction *NewInst = dyn_cast<Instruction>(V))
+    createInstPostProc(NewInst);
+  return V;
+}
+
+// Emit the negation of V as a subtraction from the zero constant that the
+// IR uses to express negation for V's type.
+Value *FAddCombine::createFNeg(Value *V) {
+  // Use the canonical zero for negation rather than +0.0, so the result has
+  // the form the fneg matchers look for.
+  Value *Zero = ConstantFP::getZeroValueForNegation(V->getType());
+  return createFSub(Zero, V);
+}
+
+// Emit "Opnd0 + Opnd1" through the builder and return the result.
+Value *FAddCombine::createFAdd
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+  // The builder may hand back something other than a new instruction (for
+  // example a folded constant); only a real instruction gets the
+  // post-processing, and an unconditional cast<> would be invalid otherwise.
+  if (Instruction *NewInst = dyn_cast<Instruction>(V))
+    createInstPostProc(NewInst);
+  return V;
+}
+
+// Emit "Opnd0 * Opnd1" through the builder and return the result.
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+  // The builder may hand back something other than a new instruction (for
+  // example a folded constant); only a real instruction gets the
+  // post-processing, and an unconditional cast<> would be invalid otherwise.
+  if (Instruction *NewInst = dyn_cast<Instruction>(V))
+    createInstPostProc(NewInst);
+  return V;
+}
+
+// Bookkeeping applied to every instruction this combiner creates: count it,
+// and make it inherit the debug location and fast-math flags of the
+// instruction being simplified.
+void FAddCombine::createInstPostProc(Instruction *NewInstr) {
+  // Keep track of the number of instructions created.
+  incCreateInstNum();
+
+  // Inherit attributes from the original instruction.
+  NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+  NewInstr->setDebugLoc(Instr->getDebugLoc());
+}
+
+// Count the instructions needed to emit the N-ary addition of Opnds.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber
+  (const AddendVect &Opnds) {
+  const unsigned Total = Opnds.size();
+
+  // Joining N addends takes N - 1 fadd/fsub instructions.
+  unsigned Cost = Total - 1;
+
+  // Number of addends with coefficient -1 or -2.
+  unsigned Negated = 0;
+
+  for (unsigned Idx = 0; Idx != Total; ++Idx) {
+    const FAddend *Cur = Opnds[Idx];
+    if (Cur->isConstant())
+      continue;
+
+    const FAddendCoef &C = Cur->getCoef();
+    const bool IsNegOne = C.isMinusOne();
+
+    if (IsNegOne || C.isMinusTwo())
+      ++Negated;
+
+    // For an addend "c * x": with "c == +/-1" the value is immediately
+    // available; any other coefficient takes exactly one instruction.
+    if (!IsNegOne && !C.isOne())
+      ++Cost;
+  }
+
+  // One more instruction when every addend counted as negated.
+  if (Negated == Total)
+    ++Cost;
+
+  return Cost;
+}
+
+// Input Addend        Value           NeedNeg(output)
+// ================================================================
+// Constant C          C               false
+// <+/-1, V>           V               coefficient is -1
+// <2/-2, V>          "fadd V, V"      coefficient is -2
+// <C, V>             "fmul V, C"      false
+//
+// Produce a value for the addend Opnd, emitting at most one instruction.
+// NeedNeg is set to true when the returned value still has to be negated by
+// the caller (coefficients -1 and -2), and to false otherwise.
+Value *FAddCombine::createAddendVal
+  (const FAddend &Opnd, bool& NeedNeg) {
+  const FAddendCoef &C = Opnd.getCoef();
+
+  // A pure constant is just its coefficient.
+  if (Opnd.isConstant()) {
+    NeedNeg = false;
+    return C.getValue(Instr->getType());
+  }
+
+  Value *Sym = Opnd.getSymVal();
+
+  // +/-1 * V: V itself, with the sign left to the caller.
+  const bool IsNegOne = C.isMinusOne();
+  if (IsNegOne || C.isOne()) {
+    NeedNeg = IsNegOne;
+    return Sym;
+  }
+
+  // +/-2 * V: "fadd V, V", with the sign left to the caller.
+  const bool IsNegTwo = C.isMinusTwo();
+  if (C.isTwo() || IsNegTwo) {
+    NeedNeg = IsNegTwo;
+    return createFAdd(Sym, Sym);
+  }
+
+  // Any other coefficient: "fmul V, C".
+  NeedNeg = false;
+  return createFMul(Sym, C.getValue(Instr->getType()));
+}
+
 /// AddOne - Add one to a ConstantInt.
 static Constant *AddOne(Constant *C) {
   return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
 }
+
 /// SubOne - Subtract one from a ConstantInt.
 static Constant *SubOne(ConstantInt *C) {
   return ConstantInt::get(C->getContext(), C->getValue()-1);
@@ -402,6 +905,13 @@
     }
   }
   
+  I.setHasUnsafeAlgebra(true);
+  if (I.hasUnsafeAlgebra()) {
+    FAddCombine C(Builder);
+    if (Value *V = C.simplify(&I))
+      return ReplaceInstUsesWith(I, V);
+  }
+
   return Changed ? &I : 0;
 }
 
@@ -645,5 +1155,12 @@
   if (Value *V = dyn_castFNegVal(Op1))
     return BinaryOperator::CreateFAdd(Op0, V);
 
+  I.setHasUnsafeAlgebra(true);
+  if (I.hasUnsafeAlgebra()) {
+    FAddCombine C(Builder);
+    if (Value *V = C.simplify(&I))
+      return ReplaceInstUsesWith(I, V);
+  }
+
   return 0;
 }