[llvm-commits] CVS: llvm/lib/Transforms/Scalar/InstructionCombining.cpp

Wed Oct 4 23:56:04 PDT 2006

Changes in directory llvm/lib/Transforms/Scalar:

InstructionCombining.cpp updated: 1.517 -> 1.518
---
Log message:

Add a new SimplifyDemandedVectorElts method, which works similarly to
SimplifyDemandedBits.  The idea is that some operations can be simplified if
not all of the computed elements are needed.  Some targets (like x86) have a
large number of intrinsics that operate on a single element, but pass other
elements through unmodified.  If those other elements are not needed, the
intrinsics can be simplified to scalar operations, and insertelement ops can
be removed.

This turns (f.e.):

ushort %Convert_sse(float %f) {
        %tmp = insertelement <4 x float> undef, float %f, uint 0                ; <<4 x float>> [#uses=1]
        %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, uint 1             ; <<4 x float>> [#uses=1]
        %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, uint 2           ; <<4 x float>> [#uses=1]
        %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, uint 3           ; <<4 x float>> [#uses=1]
        %tmp28 = tail call <4 x float> %llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
        %tmp37 = tail call <4 x float> %llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
        %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
        %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )          ; <<4 x float>> [#uses=1]
        %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )              ; <int> [#uses=1]
        %tmp69 = cast int %tmp to ushort                ; <ushort> [#uses=1]
        ret ushort %tmp69
}

into:

ushort %Convert_sse(float %f) {
entry:
        %tmp28 = sub float %f, 1.000000e+00             ; <float> [#uses=1]
        %tmp37 = mul float %tmp28, 5.000000e-01         ; <float> [#uses=1]
        %tmp375 = insertelement <4 x float> undef, float %tmp37, uint 0         ; <<4 x float>> [#uses=1]
        %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > )           ; <<4 x float>> [#uses=1]
        %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > )            ; <<4 x float>> [#uses=1]
        %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )              ; <int> [#uses=1]
        %tmp69 = cast int %tmp to ushort                ; <ushort> [#uses=1]
        ret ushort %tmp69
}

which improves codegen from:

_Convert_sse:
        movss LCPI1_0, %xmm0
        movss 4(%esp), %xmm1
        subss %xmm0, %xmm1
        movss LCPI1_1, %xmm0
        mulss %xmm0, %xmm1
        movss LCPI1_2, %xmm0
        minss %xmm0, %xmm1
        xorps %xmm0, %xmm0
        maxss %xmm0, %xmm1
        cvttss2si %xmm1, %eax
        andl $65535, %eax
        ret

to:

_Convert_sse:
        movss 4(%esp), %xmm0
        subss LCPI1_0, %xmm0
        mulss LCPI1_1, %xmm0
        movss LCPI1_2, %xmm1
        minss %xmm1, %xmm0
        xorps %xmm1, %xmm1
        maxss %xmm1, %xmm0
        cvttss2si %xmm0, %eax
        andl $65535, %eax
        ret


This is just a first step; it can be extended in many ways.  Testcase here:
Transforms/InstCombine/vec_demanded_elts.ll



---
Diffs of the changes:  (+254 -8)

 InstructionCombining.cpp |  262 +++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 254 insertions(+), 8 deletions(-)


Index: llvm/lib/Transforms/Scalar/InstructionCombining.cpp
diff -u llvm/lib/Transforms/Scalar/InstructionCombining.cpp:1.517 llvm/lib/Transforms/Scalar/InstructionCombining.cpp:1.518

--- llvm/lib/Transforms/Scalar/InstructionCombining.cpp:1.517	Sun Oct  1 14:40:58 2006
+++ llvm/lib/Transforms/Scalar/InstructionCombining.cpp	Thu Oct  5 01:55:50 2006
@@ -88,6 +88,25 @@
         if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i)))
           WorkList.push_back(Op);
     }
+    
+    /// AddSoonDeadInstToWorklist - The specified instruction is about to become
+    /// dead.  Add each operand that is itself an Instruction to the worklist and
+    /// replace that operand with undef, dropping I's use of it.
+    ///
+    /// Return operand number 'op' as it was before any operand was replaced.
+    ///
+    Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
+      Value *R = I.getOperand(op);
+      
+      for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i))) {
+          WorkList.push_back(Op);
+          // Set the operand to undef to drop the use.
+          I.setOperand(i, UndefValue::get(Op->getType()));
+        }
+      
+      return R;
+    }
 
     // removeFromWorkList - remove all instances of I from the worklist.
     void removeFromWorkList(Instruction *I);
@@ -241,6 +260,9 @@
                               uint64_t &KnownZero, uint64_t &KnownOne,
                               unsigned Depth = 0);
 
+    Value *SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
+                                      uint64_t &UndefElts, unsigned Depth = 0);
+      
     // FoldOpIntoPhi - Given a binary operator or cast instruction which has a
     // PHI node as operand #0, see if we can fold the instruction into the PHI
     // (which is only possible if all operands to the PHI are constants).
@@ -1173,6 +1195,208 @@
   return false;
 }  
 
+
+/// SimplifyDemandedVectorElts - The specified value produces a vector with
+/// 64 or fewer elements.  DemandedElts contains the set of elements that are
+/// actually used by the caller.  This method analyzes which elements of the
+/// operand are undef and returns that information in UndefElts.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned.  This returns null if no change was made.
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
+                                                uint64_t &UndefElts,
+                                                unsigned Depth) {
+  unsigned VWidth = cast<PackedType>(V->getType())->getNumElements();
+  assert(VWidth <= 64 && "Vector too wide to analyze!");
+  uint64_t EltMask = ~0ULL >> (64-VWidth);
+  assert(DemandedElts != EltMask && (DemandedElts & ~EltMask) == 0 &&
+         "Invalid DemandedElts!");
+
+  if (isa<UndefValue>(V)) {
+    // If the entire vector is undefined, just return this info.
+    UndefElts = EltMask;
+    return 0;
+  } else if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+    UndefElts = EltMask;
+    return UndefValue::get(V->getType());
+  }
+  
+  UndefElts = 0;
+  if (ConstantPacked *CP = dyn_cast<ConstantPacked>(V)) {
+    const Type *EltTy = cast<PackedType>(V->getType())->getElementType();
+    Constant *Undef = UndefValue::get(EltTy);
+
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i)
+      if (!(DemandedElts & (1ULL << i))) {   // If not demanded, set to undef.
+        Elts.push_back(Undef);
+        UndefElts |= (1ULL << i);
+      } else if (isa<UndefValue>(CP->getOperand(i))) {   // Already undef.
+        Elts.push_back(Undef);
+        UndefElts |= (1ULL << i);
+      } else {                               // Otherwise, defined.
+        Elts.push_back(CP->getOperand(i));
+      }
+        
+    // If we changed the constant, return it.
+    Constant *NewCP = ConstantPacked::get(Elts);
+    return NewCP != CP ? NewCP : 0;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    // Simplify the CAZ to a ConstantPacked where the non-demanded elements are
+    // set to undef.
+    const Type *EltTy = cast<PackedType>(V->getType())->getElementType();
+    Constant *Zero = Constant::getNullValue(EltTy);
+    Constant *Undef = UndefValue::get(EltTy);
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i)
+      Elts.push_back((DemandedElts & (1ULL << i)) ? Zero : Undef);
+    UndefElts = DemandedElts ^ EltMask;
+    return ConstantPacked::get(Elts);
+  }
+  
+  if (!V->hasOneUse()) {    // Other users may use these elements.
+    if (Depth != 0) {       // Not at the root.
+      // TODO: Just compute the UndefElts information recursively.
+      return 0;
+    }
+    return 0;
+  } else if (Depth == 10) {        // Limit search depth.
+    return 0;
+  }
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return 0;            // Only analyze instructions.
+  
+  bool MadeChange = false;
+  uint64_t UndefElts2;
+  Value *TmpV;
+  switch (I->getOpcode()) {
+  default: break;
+    
+  case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites.
+    // Demand exactly the same input as we produce.
+    ConstantUInt *Idx = dyn_cast<ConstantUInt>(I->getOperand(2));
+    if (Idx == 0) {
+      // Note that we can't propagate undef elt info, because we don't know
+      // which elt is getting updated.
+      TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+      break;
+    }
+    
+    // If this is inserting an element that isn't demanded, remove this
+    // insertelement.
+    unsigned IdxNo = Idx->getValue();
+    if (IdxNo >= VWidth || (DemandedElts & (1ULL << IdxNo)) == 0)
+      return AddSoonDeadInstToWorklist(*I, 0);
+    
+    // Otherwise, the element inserted overwrites whatever was there, so the
+    // input demanded set is simpler than the output set.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0),
+                                      DemandedElts & ~(1ULL << IdxNo),
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    // The inserted element is defined, so clear its bit in the undef set.
+    UndefElts &= ~(1ULL << IdxNo);
+    break;
+  }
+    
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // div/rem demand all inputs, because they don't want divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+      
+    // Output elements are undefined if both are undefined.  Consider things
+    // like undef&0.  The result is known zero, not undef.
+    UndefElts &= UndefElts2;
+    break;
+    
+  case Instruction::Call: {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (!II) break;
+    switch (II->getIntrinsicID()) {
+    default: break;
+      
+    // Binary vector operations that work column-wise.  A dest element is a
+    // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_sub_ss:
+    case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_min_ss:
+    case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse2_sub_sd:
+    case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_min_sd:
+    case Intrinsic::x86_sse2_max_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+                                        UndefElts, Depth+1);
+      if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; }
+
+      // If only the low elt is demanded and this is a scalarizable intrinsic,
+      // scalarize it now.
+      if (DemandedElts == 1) {
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::x86_sse_sub_ss:
+        case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_sub_sd:
+        case Intrinsic::x86_sse2_mul_sd:
+          // TODO: Lower MIN/MAX/ABS/etc
+          Value *LHS = II->getOperand(1);
+          Value *RHS = II->getOperand(2);
+          // Extract the element as scalars.
+          LHS = InsertNewInstBefore(new ExtractElementInst(LHS, 0U,"tmp"), *II);
+          RHS = InsertNewInstBefore(new ExtractElementInst(RHS, 0U,"tmp"), *II);
+          
+          switch (II->getIntrinsicID()) {
+          default: assert(0 && "Case stmts out of sync!");
+          case Intrinsic::x86_sse_sub_ss:
+          case Intrinsic::x86_sse2_sub_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::createSub(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
+          case Intrinsic::x86_sse_mul_ss:
+          case Intrinsic::x86_sse2_mul_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::createMul(LHS, RHS,
+                                                         II->getName()), *II);
+            break;
+          }
+          
+          Instruction *New =
+            new InsertElementInst(UndefValue::get(II->getType()), TmpV, 0U,
+                                  II->getName());
+          InsertNewInstBefore(New, *II);
+          AddSoonDeadInstToWorklist(*II, 0);
+          return New;
+        }            
+      }
+        
+      // Output elements are undefined if both are undefined.  Consider things
+      // like undef&0.  The result is known zero, not undef.
+      UndefElts &= UndefElts2;
+      break;
+    }
+    break;
+  }
+  }
+  return MadeChange ? I : 0;
+}
+
 // isTrueWhenEqual - Return true if the specified setcondinst instruction is
 // true when both operands are equal...
 //
@@ -6088,6 +6312,19 @@
         return new StoreInst(II->getOperand(2), Ptr);
       }
       break;
+      
+    case Intrinsic::x86_sse_cvttss2si: {
+      // This intrinsic only demands the 0th element of its input vector.  If
+      // we can simplify the input based on that, do so now.
+      uint64_t UndefElts;
+      if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1, 
+                                                UndefElts)) {
+        II->setOperand(1, V);
+        return II;
+      }
+      break;
+    }
+      
     case Intrinsic::ppc_altivec_vperm:
       // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
       if (ConstantPacked *Mask = dyn_cast<ConstantPacked>(II->getOperand(3))) {
@@ -6121,17 +6358,13 @@
             
             if (ExtractedElts[Idx] == 0) {
               Instruction *Elt = 
-                new ExtractElementInst(Idx < 16 ? Op0 : Op1,
-                                       ConstantUInt::get(Type::UIntTy, Idx&15),
-                                       "tmp");
+                new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
               InsertNewInstBefore(Elt, CI);
               ExtractedElts[Idx] = Elt;
             }
           
             // Insert this value into the result vector.
-            Result = new InsertElementInst(Result, ExtractedElts[Idx],
-                                           ConstantUInt::get(Type::UIntTy, i),
-                                           "tmp");
+            Result = new InsertElementInst(Result, ExtractedElts[Idx], i,"tmp");
             InsertNewInstBefore(cast<Instruction>(Result), CI);
           }
           return new CastInst(Result, CI.getType());
@@ -7512,6 +7745,19 @@
   // If extracting a specified index from the vector, see if we can recursively
   // find a previously computed scalar that was inserted into the vector.
   if (ConstantUInt *IdxC = dyn_cast<ConstantUInt>(EI.getOperand(1))) {
+    // This instruction only demands the single element from the input vector.
+    // If the input vector has a single use, simplify it based on this use
+    // property.
+    if (EI.getOperand(0)->hasOneUse()) {
+      uint64_t UndefElts;
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
+                                                1 << IdxC->getValue(),
+                                                UndefElts)) {
+        EI.setOperand(0, V);
+        return &EI;
+      }
+    }
+    
     if (Value *Elt = FindScalarElement(EI.getOperand(0), IdxC->getValue()))
       return ReplaceInstUsesWith(EI, Elt);
   }
@@ -7569,8 +7815,7 @@
         } else {
           return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
         }
-        return new ExtractElementInst(Src,
-                                      ConstantUInt::get(Type::UIntTy, SrcIdx));
+        return new ExtractElementInst(Src, SrcIdx);
       }
     }
   }
@@ -7782,6 +8027,7 @@
 
   bool MadeChange = false;
   
+  // Undefined shuffle mask -> undefined value.
   if (isa<UndefValue>(SVI.getOperand(2)))
     return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));