[llvm-commits] [llvm] r170471 - in /llvm/trunk: lib/Transforms/InstCombine/InstCombineAddSub.cpp test/Transforms/InstCombine/fast-math.ll

Tue Dec 18 15:52:39 PST 2012

On Tue, Dec 18, 2012 at 3:49 PM, Eli Friedman <eli.friedman at gmail.com> wrote:
> On Tue, Dec 18, 2012 at 3:10 PM, Shuxin Yang <shuxin.llvm at gmail.com> wrote:
>> Author: shuxin_yang
>> Date: Tue Dec 18 17:10:12 2012
>> New Revision: 170471
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=170471&view=rev
>> Log:
>> rdar://12801297
>>
>>  InstCombine for unsafe floating-point add/sub.
>>
>> Modified:
>>     llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp
>>     llvm/trunk/test/Transforms/InstCombine/fast-math.ll
>
> General comment: I was sort of expecting one more round of review
> before you committed this because you made some substantial changes to
> the previous version.  (No need to back out, though.)
>
>> Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp?rev=170471&r1=170470&r2=170471&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp (original)
>> +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp Tue Dec 18 17:10:12 2012
>> @@ -19,10 +19,715 @@
>>  using namespace llvm;
>>  using namespace PatternMatch;
>>
>> +namespace {
>> +
>> +  /// Class representing coefficient of floating-point addend.
>> +  /// This class needs to be highly efficient, which is especially true for
>> +  /// the constructor. As I write this comment, the cost of the default
>> +  /// constructor is merely 4-byte-store-zero (Assuming compiler is able to
>> +  /// perform write-merging).
>> +  ///
>> +  class FAddendCoef {
>> +  public:
>> +    // The constructor has to initialize an APFloat, which is unnecessary for
>> +    // most addends which have coefficient either 1 or -1. So, the constructor
>> +    // is expensive. In order to avoid the cost of the constructor, we should
>> +    // reuse some instances whenever possible. The pre-created instances
>> +    // FAddCombine::Add[0-5] embodies this idea.
>> +    //
>> +    FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {}
>> +    ~FAddendCoef();
>> +
>> +    void set(short C) {
>> +      assert(!insaneIntVal(C) && "Insane coefficient");
>> +      IsFp = false; IntVal = C;
>> +    }
>> +
>> +    void set(const APFloat& C);
>> +
>> +    void negate();
>> +
>> +    bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
>> +    Value *getValue(Type *) const;
>> +
>> +    // If possible, don't define operator+/operator- etc because these
>> +    // operators inevitably call FAddendCoef's constructor which is not cheap.
>> +    void operator=(const FAddendCoef &A);
>> +    void operator+=(const FAddendCoef &A);
>> +    void operator-=(const FAddendCoef &A);
>> +    void operator*=(const FAddendCoef &S);
>> +
>> +    bool isOne() const { return isInt() && IntVal == 1; }
>> +    bool isTwo() const { return isInt() && IntVal == 2; }
>> +    bool isMinusOne() const { return isInt() && IntVal == -1; }
>> +    bool isMinusTwo() const { return isInt() && IntVal == -2; }
>> +
>> +  private:
>> +    bool insaneIntVal(int V) { return V > 4 || V < -4; }
>> +    APFloat *getFpValPtr(void)
>> +      { return reinterpret_cast<APFloat*>(&FpValBuf[0]); }
>> +
>> +    const APFloat &getFpVal(void) const {
>> +      assert(IsFp && BufHasFpVal && "Incorret state");
>> +      return *reinterpret_cast<const APFloat*>(&FpValBuf[0]);
>> +    }
>> +
>> +    APFloat &getFpVal(void)
>> +      { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); }
>> +
>> +    bool isInt() const { return !IsFp; }
>> +
>> +  private:
>> +    bool IsFp;
>> +
>> +    // True iff FpValBuf contains an instance of APFloat.
>> +    bool BufHasFpVal;
>> +
>> +    // The integer coefficient of an individual addend is either 1 or -1,
>> +    // and we try to simplify at most 4 addends from neighboring at most
>> +    // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
>> +    // is overkill for this purpose.
>> +    short IntVal;
>> +
>> +    union {
>> +      char FpValBuf[sizeof(APFloat)];
>> +      int dummy; // So this structure has at least 4-byte alignment.
>> +    };
>
> I'm sure I made a comment here before about using llvm::AlignedCharArrayUnion...
>
>> +  };
>> +
>> +  /// FAddend is used to represent floating-point addend. An addend is
>> +  /// represented as <C, V>, where the V is a symbolic value, and C is a
>> +  /// constant coefficient. A constant addend is represented as <C, 0>.
>> +  ///
>> +  class FAddend {
>> +  public:
>> +    FAddend() { Val = 0; }
>> +
>> +    Value *getSymVal (void) const { return Val; }
>> +    const FAddendCoef &getCoef(void) const { return Coeff; }
>> +
>> +    bool isConstant() const { return Val == 0; }
>> +    bool isZero() const { return Coeff.isZero(); }
>> +
>> +    void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
>> +    void set(const APFloat& Coefficient, Value *V)
>> +      { Coeff.set(Coefficient); Val = V; }
>> +    void set(const ConstantFP* Coefficient, Value *V)
>> +      { Coeff.set(Coefficient->getValueAPF()); Val = V; }
>> +
>> +    void negate() { Coeff.negate(); }
>> +
>> +    /// Drill down the U-D chain one step to find the definition of V, and
>> +    /// try to break the definition into one or two addends.
>> +    static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
>> +
>> +    /// Similar to FAddend::drillValueDownOneStep() except that the value being
>> +    /// split is the addend itself.
>> +    unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
>> +
>> +    void operator+=(const FAddend &T) {
>> +      assert((Val == T.Val) && "Symbolic-values disagree");
>> +      Coeff += T.Coeff;
>> +    }
>> +
>> +  private:
>> +    void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
>> +
>> +    // This addend has the value of "Coeff * Val".
>> +    Value *Val;
>> +    FAddendCoef Coeff;
>> +  };
>> +
>> +  /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
>> +  /// with its neighboring at most two instructions.
>> +  ///
>> +  class FAddCombine {
>> +  public:
>> +    FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
>> +    Value *simplify(Instruction *FAdd);
>> +
>> +  private:
>> +    typedef SmallVector<const FAddend*, 4> AddendVect;
>> +
>> +    Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
>> +
>> +    /// Convert given addend to a Value
>> +    Value *createAddendVal(const FAddend &A, bool& NeedNeg);
>> +
>> +    /// Return the number of instructions needed to emit the N-ary addition.
>> +    unsigned calcInstrNumber(const AddendVect& Vect);
>> +    Value *createFSub(Value *Opnd0, Value *Opnd1);
>> +    Value *createFAdd(Value *Opnd0, Value *Opnd1);
>> +    Value *createFMul(Value *Opnd0, Value *Opnd1);
>> +    Value *createFNeg(Value *V);
>> +    Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
>> +    void createInstPostProc(Instruction *NewInst);
>> +
>> +    InstCombiner::BuilderTy *Builder;
>> +    Instruction *Instr;
>> +
>> +  private:
>> +     // Debugging stuff are clustered here.
>> +    #ifndef NDEBUG
>> +      unsigned CreateInstrNum;
>> +      void initCreateInstNum() { CreateInstrNum = 0; }
>> +      void incCreateInstNum() { CreateInstrNum++; }
>> +    #else
>> +      void initCreateInstNum() {}
>> +      void incCreateInstNum() {}
>> +    #endif
>> +  };
>> +}
>> +
>> +//===----------------------------------------------------------------------===//
>> +//
>> +// Implementation of
>> +//    {FAddendCoef, FAddend, FAddition, FAddCombine}.
>> +//
>> +//===----------------------------------------------------------------------===//
>> +FAddendCoef::~FAddendCoef() {
>> +  if (BufHasFpVal)
>> +    getFpValPtr()->~APFloat();
>> +}
>> +
>> +void FAddendCoef::set(const APFloat& C) {
>> +  APFloat *P = getFpValPtr();
>> +
>> +  if (isInt()) {
>> +    // As the buffer is a meaningless byte stream, we cannot call
>> +    // APFloat::operator=().
>> +    new(P) APFloat(C);
>> +  } else
>> +    *P = C;
>> +
>> +  IsFp = BufHasFpVal = true;
>> +}
>> +
>> +void FAddendCoef::operator=(const FAddendCoef& That) {
>> +  if (That.isInt())
>> +    set(That.IntVal);
>> +  else
>> +    set(That.getFpVal());
>> +}
>> +
>> +void FAddendCoef::operator+=(const FAddendCoef &That) {
>> +  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
>> +  if (isInt() == That.isInt()) {
>> +    if (isInt())
>> +      IntVal += That.IntVal;
>> +    else
>> +      getFpVal().add(That.getFpVal(), RndMode);
>> +    return;
>> +  }
>> +
>> +  if (isInt()) {
>> +    const APFloat &T = That.getFpVal();
>> +    set(T);
>> +    getFpVal().add(APFloat(T.getSemantics(), IntVal), RndMode);
>> +    return;
>> +  }
>> +
>> +  APFloat &T = getFpVal();
>> +  T.add(APFloat(T.getSemantics(), That.IntVal), RndMode);
>> +}
>> +
>> +void FAddendCoef::operator-=(const FAddendCoef &That) {
>> +  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
>> +  if (isInt() == That.isInt()) {
>> +    if (isInt())
>> +      IntVal -= That.IntVal;
>> +    else
>> +      getFpVal().subtract(That.getFpVal(), RndMode);
>> +    return;
>> +  }
>> +
>> +  if (isInt()) {
>> +    const APFloat &T = That.getFpVal();
>> +    set(T);
>> +    getFpVal().subtract(APFloat(T.getSemantics(), IntVal), RndMode);
>> +    return;
>> +  }
>> +
>> +  APFloat &T = getFpVal();
>> +  T.subtract(APFloat(T.getSemantics(), IntVal), RndMode);
>> +}
>> +
>> +void FAddendCoef::operator*=(const FAddendCoef &That) {
>> +  if (That.isOne())
>> +    return;
>> +
>> +  if (That.isMinusOne()) {
>> +    negate();
>> +    return;
>> +  }
>> +
>> +  if (isInt() && That.isInt()) {
>> +    int Res = IntVal * (int)That.IntVal;
>> +    assert(!insaneIntVal(Res) && "Insane int value");
>> +    IntVal = Res;
>> +    return;
>> +  }
>> +
>> +  const fltSemantics &Semantic =
>> +    isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
>> +
>> +  if (isInt())
>> +    set(APFloat(Semantic, IntVal));
>> +  APFloat &F0 = getFpVal();
>> +
>> +  if (That.isInt())
>> +    F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven);
>> +  else
>> +    F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
>> +
>> +  return;
>> +}
>> +
>> +void FAddendCoef::negate() {
>> +  if (isInt())
>> +    IntVal = 0 - IntVal;
>> +  else
>> +    getFpVal().changeSign();
>> +}
>> +
>> +Value *FAddendCoef::getValue(Type *Ty) const {
>> +  return isInt() ?
>> +    ConstantFP::get(Ty, float(IntVal)) :
>> +    ConstantFP::get(Ty->getContext(), getFpVal());
>> +}
>> +
>> +// The definition of <Val>     Addends
>> +// =========================================
>> +//  A + B                     <1, A>, <1,B>
>> +//  A - B                     <1, A>, <-1,B>
>> +//  0 - B                     <-1, B>
>> +//  C * A,                    <C, A>
>> +//  A + C                     <1, A> <C, NULL>
>> +//  0 +/- 0                   <0, NULL> (corner case)
>> +//
>> +// Legend: A and B are not constant, C is constant
>> +//
>> +unsigned FAddend::drillValueDownOneStep
>> +  (Value *Val, FAddend &Addend0, FAddend &Addend1) {
>> +  Instruction *I = 0;
>> +  if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
>> +    return 0;
>> +
>> +  unsigned Opcode = I->getOpcode();
>> +
>> +  if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
>> +    ConstantFP *C0, *C1;
>> +    Value *Opnd0 = I->getOperand(0);
>> +    Value *Opnd1 = I->getOperand(1);
>> +    if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
>> +      Opnd0 = 0;
>> +
>> +    if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
>> +      Opnd1 = 0;
>> +
>> +    if (Opnd0) {
>> +      if (!C0)
>> +        Addend0.set(1, Opnd0);
>> +      else
>> +        Addend0.set(C0, 0);
>> +    }
>> +
>> +    if (Opnd1) {
>> +      FAddend &Addend = Opnd0 ? Addend1 : Addend0;
>> +      if (!C1)
>> +        Addend.set(1, Opnd1);
>> +      else
>> +        Addend.set(C1, 0);
>> +      if (Opcode == Instruction::FSub)
>> +        Addend.negate();
>> +    }
>> +
>> +    if (Opnd0 || Opnd1)
>> +      return Opnd0 && Opnd1 ? 2 : 1;
>> +
>> +    // Both operands are zero. Weird!
>> +    Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0);
>> +    return 1;
>> +  }
>> +
>> +  if (I->getOpcode() == Instruction::FMul) {
>> +    Value *V0 = I->getOperand(0);
>> +    Value *V1 = I->getOperand(1);
>> +    if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
>> +      Addend0.set(C, V1);
>> +      return 1;
>> +    }
>> +
>> +    if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
>> +      Addend0.set(C, V0);
>> +      return 1;
>> +    }
>> +  }
>> +
>> +  return 0;
>> +}
>> +
>> +// Try to break *this* addend into two addends. e.g. Suppose this addend is
>> +// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends,
>> +// i.e. <2.3, X> and <2.3, Y>.
>> +//
>> +unsigned FAddend::drillAddendDownOneStep
>> +  (FAddend &Addend0, FAddend &Addend1) const {
>> +  if (isConstant())
>> +    return 0;
>> +
>> +  unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
>> +  if (!BreakNum || Coeff.isOne())
>> +    return BreakNum;
>> +
>> +  Addend0.Scale(Coeff);
>> +
>> +  if (BreakNum == 2)
>> +    Addend1.Scale(Coeff);
>> +
>> +  return BreakNum;
>> +}
>> +
>> +Value *FAddCombine::simplify(Instruction *I) {
>> +  assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
>> +
>> +  // Currently we are not able to handle vector type.
>> +  if (I->getType()->isVectorTy())
>> +    return 0;
>> +
>> +  assert((I->getOpcode() == Instruction::FAdd ||
>> +          I->getOpcode() == Instruction::FSub) && "Expect add/sub");
>> +
>> +  // Save the instruction before calling other member-functions.
>> +  Instr = I;
>> +
>> +  FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
>> +
>> +  unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
>> +
>> +  // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
>> +  unsigned Opnd0_ExpNum = 0;
>> +  unsigned Opnd1_ExpNum = 0;
>> +
>> +  if (!Opnd0.isConstant())
>> +    Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
>> +
>> +  // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
>> +  if (OpndNum == 2 && !Opnd1.isConstant())
>> +    Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
>> +
>> +  // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
>> +  if (Opnd0_ExpNum && Opnd1_ExpNum) {
>> +    AddendVect AllOpnds;
>> +    AllOpnds.push_back(&Opnd0_0);
>> +    AllOpnds.push_back(&Opnd1_0);
>> +    if (Opnd0_ExpNum == 2)
>> +      AllOpnds.push_back(&Opnd0_1);
>> +    if (Opnd1_ExpNum == 2)
>> +      AllOpnds.push_back(&Opnd1_1);
>> +
>> +    // Compute instruction quota. We should save at least one instruction.
>> +    unsigned InstQuota = 0;
>> +
>> +    Value *V0 = I->getOperand(0);
>> +    Value *V1 = I->getOperand(1);
>> +    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
>> +                 (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
>> +
>> +    if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
>> +      return R;
>> +  }
>> +
>> +  if (OpndNum != 2) {
>> +    // The input instruction is : "I=0.0 +/- V". If the "V" were able to be
>> +    // split into two addends, say "V = X - Y", the instruction would have
>> +    // been optimized into "I = Y - X" in the previous steps.
>> +    //
>> +    const FAddendCoef &CE = Opnd0.getCoef();
>> +    return CE.isOne() ? Opnd0.getSymVal() : 0;
>> +  }
>> +
>> +  // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
>> +  if (Opnd1_ExpNum) {
>> +    AddendVect AllOpnds;
>> +    AllOpnds.push_back(&Opnd0);
>> +    AllOpnds.push_back(&Opnd1_0);
>> +    if (Opnd1_ExpNum == 2)
>> +      AllOpnds.push_back(&Opnd1_1);
>> +
>> +    if (Value *R = simplifyFAdd(AllOpnds, 1))
>> +      return R;
>> +  }
>> +
>> +  // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
>> +  if (Opnd0_ExpNum) {
>> +    AddendVect AllOpnds;
>> +    AllOpnds.push_back(&Opnd1);
>> +    AllOpnds.push_back(&Opnd0_0);
>> +    if (Opnd0_ExpNum == 2)
>> +      AllOpnds.push_back(&Opnd0_1);
>> +
>> +    if (Value *R = simplifyFAdd(AllOpnds, 1))
>> +      return R;
>> +  }
>> +
>> +  return 0;
>> +}
>> +
>> +Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
>> +
>> +  unsigned AddendNum = Addends.size();
>> +  assert(AddendNum <= 4 && "Too many addends");
>> +
>> +  // For saving intermediate results;
>> +  unsigned NextTmpIdx = 0;
>> +  FAddend TmpResult[3];
>> +
>> +  // Points to the constant addend of the resulting simplified expression.
>> +  // If the resulting expr has constant-addend, this constant-addend is
>> +  // desirable to reside at the top of the resulting expression tree. Placing
>> +  // constant close to super-expr(s) will potentially reveal some optimization
>> +  // opportunities in super-expr(s).
>> +  //
>> +  const FAddend *ConstAdd = 0;
>> +
>> +  // Simplified addends are placed <SimpVect>.
>> +  AddendVect SimpVect;
>> +
>> +  // The outer loop works on one symbolic-value at a time. Suppose the input
>> +  // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
>> +  // The symbolic-values will be processed in this order: x, y, z.
>> +  //
>> +  for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
>> +
>> +    const FAddend *ThisAddend = Addends[SymIdx];
>> +    if (!ThisAddend) {
>> +      // This addend was processed before.
>> +      continue;
>> +    }
>> +
>> +    Value *Val = ThisAddend->getSymVal();
>> +    unsigned StartIdx = SimpVect.size();
>> +    SimpVect.push_back(ThisAddend);
>> +
>> +    // The inner loop collects addends sharing same symbolic-value, and these
>> +    // addends will be later on folded into a single addend. Following above
>> +    // example, if the symbolic value "y" is being processed, the inner loop
>> +    // will collect two addends "<b1,y>" and "<b2,y>". These two addends will
>> +    // be later on folded into "<b1+b2, y>".
>> +    //
>> +    for (unsigned SameSymIdx = SymIdx + 1;
>> +         SameSymIdx < AddendNum; SameSymIdx++) {
>> +      const FAddend *T = Addends[SameSymIdx];
>> +      if (T && T->getSymVal() == Val) {
>> +        // Set null such that next iteration of the outer loop will not process
>> +        // this addend again.
>> +        Addends[SameSymIdx] = 0;
>> +        SimpVect.push_back(T);
>> +      }
>> +    }
>> +
>> +    // If multiple addends share same symbolic value, fold them together.
>> +    if (StartIdx + 1 != SimpVect.size()) {
>> +      FAddend &R = TmpResult[NextTmpIdx ++];
>> +      R = *SimpVect[StartIdx];
>> +      for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
>> +        R += *SimpVect[Idx];
>> +
>> +      // Pop all addends being folded and push the resulting folded addend.
>> +      SimpVect.resize(StartIdx);
>> +      if (Val != 0) {
>> +        if (!R.isZero()) {
>> +          SimpVect.push_back(&R);
>> +        }
>> +      } else {
>> +        // Don't push constant addend at this time. It will be the last element
>> +        // of <SimpVect>.
>> +        ConstAdd = &R;
>> +      }
>> +    }
>> +  }
>
> This loop seems overly complicated for what it's actually doing... are
> you sure you can't simplify it any?
>
>> +  assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) &&
>> +         "out-of-bound access");
>
> llvm::array_lengthof?
>
> -Eli

Oh, and one more comment: I would like to see at least twice as many
tests as you have now.  The tests you have cover the basic
functionality, but not the more complicated cases.

-Eli