[llvm-commits] [llvm] r167256 - in /llvm/trunk: lib/Transforms/Vectorize/BBVectorize.cpp test/Transforms/BBVectorize/X86/simple-ldstr.ll test/Transforms/BBVectorize/X86/simple.ll

Hal Finkel hfinkel at anl.gov
Thu Nov 1 14:50:13 PDT 2012


Author: hfinkel
Date: Thu Nov  1 16:50:12 2012
New Revision: 167256

URL: http://llvm.org/viewvc/llvm-project?rev=167256&view=rev
Log:
BBVectorize: Use target costs for incoming and outgoing values instead of the depth heuristic.

When target cost information is available, compute explicit costs of inserting and
extracting values from vectors. At this point, all costs are estimated using the
target information, and the chain-depth heuristic is not needed. As a result, it is now, by
default, disabled when using target costs.

Added:
    llvm/trunk/test/Transforms/BBVectorize/X86/simple-ldstr.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp
    llvm/trunk/test/Transforms/BBVectorize/X86/simple.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp?rev=167256&r1=167255&r2=167256&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp Thu Nov  1 16:50:12 2012
@@ -58,6 +58,11 @@
 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
   cl::desc("The required chain depth for vectorization"));
 
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
+  cl::Hidden, cl::desc("Use the chain depth requirement with"
+                       " target information"));
+
 static cl::opt<unsigned>
 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
   cl::desc("The maximum search distance for instruction pairs"));
@@ -227,6 +232,9 @@
                        DenseMap<ValuePair, int> &CandidatePairCostSavings,
                        std::vector<Value *> &PairableInsts, bool NonPow2Len);
 
+    // FIXME: The current implementation does not account for pairs that
+    // are connected in multiple ways. For example:
+    //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
     enum PairConnectionType {
       PairConnectionDirect,
       PairConnectionSwap,
@@ -1665,20 +1673,39 @@
 
       int EffSize = 0;
       if (VTTI) {
+        DenseSet<Value *> PrunedTreeInstrs;
+        for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+             E = PrunedTree.end(); S != E; ++S) {
+          PrunedTreeInstrs.insert(S->first);
+          PrunedTreeInstrs.insert(S->second);
+        }
+
+        // The set of pairs that have already contributed to the total cost.
+        DenseSet<ValuePair> IncomingPairs;
+
         // The node weights represent the cost savings associated with
         // fusing the pair of instructions.
         for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
              E = PrunedTree.end(); S != E; ++S) {
-          if (getDepthFactor(S->first))
-            EffSize += CandidatePairCostSavings.find(*S)->second;
+          bool FlipOrder = false;
 
-        // The edge weights contribute in a negative sense: they represent
-        // the cost of shuffles.
+          if (getDepthFactor(S->first)) {
+            int ESContrib = CandidatePairCostSavings.find(*S)->second;
+            DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+                   << *S->first << " <-> " << *S->second << "} = " <<
+                   ESContrib << "\n");
+            EffSize += ESContrib;
+          }
+
+          // The edge weights contribute in a negative sense: they represent
+          // the cost of shuffles.
           VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
           if (IP.first != ConnectedPairDeps.end()) {
             unsigned NumDepsDirect = 0, NumDepsSwap = 0;
             for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
                  Q != IP.second; ++Q) {
+              if (!PrunedTree.count(Q->second))
+                continue;
               DenseMap<VPPair, unsigned>::iterator R =
                 PairConnectionTypes.find(VPPair(Q->second, Q->first));
               assert(R != PairConnectionTypes.end() &&
@@ -1692,12 +1719,14 @@
             // If there are more swaps than direct connections, then
             // the pair order will be flipped during fusion. So the real
             // number of swaps is the minimum number.
-            bool FlipOrder = !FixedOrderPairs.count(*S) &&
+            FlipOrder = !FixedOrderPairs.count(*S) &&
               ((NumDepsSwap > NumDepsDirect) ||
                 FixedOrderPairs.count(ValuePair(S->second, S->first)));
 
             for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
                  Q != IP.second; ++Q) {
+              if (!PrunedTree.count(Q->second))
+                continue;
               DenseMap<VPPair, unsigned>::iterator R =
                 PairConnectionTypes.find(VPPair(Q->second, Q->first));
               assert(R != PairConnectionTypes.end() &&
@@ -1707,9 +1736,161 @@
               Type *VTy = getVecTypeForPair(Ty1, Ty2);
               if ((R->second == PairConnectionDirect && FlipOrder) ||
                   (R->second == PairConnectionSwap && !FlipOrder)  ||
-                  R->second == PairConnectionSplat)
-                EffSize -= (int) getInstrCost(Instruction::ShuffleVector,
-                                              VTy, VTy);
+                  R->second == PairConnectionSplat) {
+                int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                                   VTy, VTy);
+                DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                  *Q->second.first << " <-> " << *Q->second.second <<
+                    "} -> {" <<
+                  *S->first << " <-> " << *S->second << "} = " <<
+                   ESContrib << "\n");
+                EffSize -= ESContrib;
+              }
+            }
+          }
+
+          // Compute the cost of outgoing edges. We assume that edges outgoing
+          // to shuffles, inserts or extracts can be merged, and so contribute
+          // no additional cost.
+          if (!S->first->getType()->isVoidTy()) {
+            Type *Ty1 = S->first->getType(),
+                 *Ty2 = S->second->getType();
+            Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+            bool NeedsExtraction = false;
+            for (Value::use_iterator I = S->first->use_begin(),
+                 IE = S->first->use_end(); I != IE; ++I) {
+              if (isa<ShuffleVectorInst>(*I) ||
+                  isa<InsertElementInst>(*I) ||
+                  isa<ExtractElementInst>(*I))
+                continue;
+              if (PrunedTreeInstrs.count(*I))
+                continue;
+              NeedsExtraction = true;
+              break;
+            }
+
+            if (NeedsExtraction) {
+              int ESContrib;
+              if (Ty1->isVectorTy())
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               Ty1, VTy);
+              else
+                ESContrib = (int) VTTI->getVectorInstrCost(
+                                    Instruction::ExtractElement, VTy, 0);
+
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                *S->first << "} = " << ESContrib << "\n");
+              EffSize -= ESContrib;
+            }
+
+            NeedsExtraction = false;
+            for (Value::use_iterator I = S->second->use_begin(),
+                 IE = S->second->use_end(); I != IE; ++I) {
+              if (isa<ShuffleVectorInst>(*I) ||
+                  isa<InsertElementInst>(*I) ||
+                  isa<ExtractElementInst>(*I))
+                continue;
+              if (PrunedTreeInstrs.count(*I))
+                continue;
+              NeedsExtraction = true;
+              break;
+            }
+
+            if (NeedsExtraction) {
+              int ESContrib;
+              if (Ty2->isVectorTy())
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               Ty2, VTy);
+              else
+                ESContrib = (int) VTTI->getVectorInstrCost(
+                                    Instruction::ExtractElement, VTy, 1);
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                *S->second << "} = " << ESContrib << "\n");
+              EffSize -= ESContrib;
+            }
+          }
+
+          // Compute the cost of incoming edges.
+          if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+            Instruction *S1 = cast<Instruction>(S->first),
+                        *S2 = cast<Instruction>(S->second);
+            for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+              Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+              // Combining constants into vector constants (or small vector
+              // constants into larger ones are assumed free).
+              if (isa<Constant>(O1) && isa<Constant>(O2))
+                continue;
+
+              if (FlipOrder)
+                std::swap(O1, O2);
+
+              ValuePair VP  = ValuePair(O1, O2);
+              ValuePair VPR = ValuePair(O2, O1);
+
+              // Internal edges are not handled here.
+              if (PrunedTree.count(VP) || PrunedTree.count(VPR))
+                continue;
+
+              Type *Ty1 = O1->getType(),
+                   *Ty2 = O2->getType();
+              Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+              // Combining vector operations of the same type is also assumed
+              // folded with other operations.
+              if (Ty1 == Ty2 &&
+                  (isa<ShuffleVectorInst>(O1) ||
+                   isa<InsertElementInst>(O1) ||
+                   isa<InsertElementInst>(O1)) &&
+                  (isa<ShuffleVectorInst>(O2) ||
+                   isa<InsertElementInst>(O2) ||
+                   isa<InsertElementInst>(O2)))
+                continue;
+
+              int ESContrib;
+              // This pair has already been formed.
+              if (IncomingPairs.count(VP)) {
+                continue;
+              } else if (IncomingPairs.count(VPR)) {
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               VTy, VTy);
+              } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+                ESContrib = (int) VTTI->getVectorInstrCost(
+                                    Instruction::InsertElement, VTy, 0);
+                ESContrib += (int) VTTI->getVectorInstrCost(
+                                     Instruction::InsertElement, VTy, 1);
+              } else if (!Ty1->isVectorTy()) {
+                // O1 needs to be inserted into a vector of size O2, and then
+                // both need to be shuffled together.
+                ESContrib = (int) VTTI->getVectorInstrCost(
+                                    Instruction::InsertElement, Ty2, 0);
+                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                VTy, Ty2);
+              } else if (!Ty2->isVectorTy()) {
+                // O2 needs to be inserted into a vector of size O1, and then
+                // both need to be shuffled together.
+                ESContrib = (int) VTTI->getVectorInstrCost(
+                                    Instruction::InsertElement, Ty1, 0);
+                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                VTy, Ty1);
+              } else {
+                Type *TyBig = Ty1, *TySmall = Ty2;
+                if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+                  std::swap(TyBig, TySmall);
+
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               VTy, TyBig);
+                if (TyBig != TySmall)
+                  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                  TyBig, TySmall);
+              }
+
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+                     << *O1 << " <-> " << *O2 << "} = " <<
+                     ESContrib << "\n");
+              EffSize -= ESContrib;
+              IncomingPairs.insert(VP);
             }
           }
         }
@@ -1724,7 +1905,8 @@
              << *J->first << " <-> " << *J->second << "} of depth " <<
              MaxDepth << " and size " << PrunedTree.size() <<
             " (effective size: " << EffSize << ")\n");
-      if (MaxDepth >= Config.ReqChainDepth &&
+      if (((VTTI && !UseChainDepthWithTI) ||
+            MaxDepth >= Config.ReqChainDepth) &&
           EffSize > 0 && EffSize > BestEffSize) {
         BestMaxDepth = MaxDepth;
         BestEffSize = EffSize;

Added: llvm/trunk/test/Transforms/BBVectorize/X86/simple-ldstr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/BBVectorize/X86/simple-ldstr.ll?rev=167256&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/BBVectorize/X86/simple-ldstr.ll (added)
+++ llvm/trunk/test/Transforms/BBVectorize/X86/simple-ldstr.ll Thu Nov  1 16:50:12 2012
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+}
+

Modified: llvm/trunk/test/Transforms/BBVectorize/X86/simple.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/BBVectorize/X86/simple.ll?rev=167256&r1=167255&r2=167256&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/BBVectorize/X86/simple.ll (original)
+++ llvm/trunk/test/Transforms/BBVectorize/X86/simple.ll Thu Nov  1 16:50:12 2012
@@ -3,25 +3,84 @@
 
 ; Basic depth-3 chain
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
 ; CHECK: @test1
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain
+define double @test1a(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%W1 = fadd double %Y1, %Z1
+	%W2 = fadd double %Y2, %Z2
+	%V1 = fadd double %W1, %Z1
+	%V2 = fadd double %W2, %Z2
+	%Q1 = fadd double %W1, %V1
+	%Q2 = fadd double %W2, %V2
+	%S1 = fadd double %W1, %Q1
+	%S2 = fadd double %W2, %Q2
+	%R  = fmul double %S1, %S2
+	ret double %R
+; CHECK: @test1a
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %W1 = fadd <2 x double> %Y1, %Z1
+; CHECK: %V1 = fadd <2 x double> %W1, %Z1
+; CHECK: %Q1 = fadd <2 x double> %W1, %V1
+; CHECK: %S1 = fadd <2 x double> %W1, %Q1
+; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0
+; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1
+; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
-; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 	%Y1 = fmul double %X1, %A1
 	%Y2 = fmul double %X2, %A2
-; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
-	%Z1 = fadd double %Y1, %B1
-	%Z2 = fadd double %Y2, %B2
-; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test2
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic depth-4 chain (internal permutation)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+	%W1 = fadd double %Y2, %Z1
+	%W2 = fadd double %Y1, %Z2
 	%R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
 	ret double %R
+; CHECK: @test4
+; CHECK-NOT: fmul <2 x double>
 ; CHECK: ret double %R
 }
 
@@ -37,8 +96,8 @@
         %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
 	%R  = mul <8 x i8> %Q1, %Q2
 	ret <8 x i8> %R
-; CHECK-TI: @test6
-; CHECK-TI-NOT: sub <16 x i8>
-; CHECK-TI: ret <8 x i8>
+; CHECK: @test6
+; CHECK-NOT: sub <16 x i8>
+; CHECK: ret <8 x i8>
 }
 





More information about the llvm-commits mailing list