[llvm] r267899 - [SLPVectorizer] Extend SLP Vectorizer to deal with aggregates.
Arch D. Robison via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 28 09:11:46 PDT 2016
Author: adrobiso
Date: Thu Apr 28 11:11:45 2016
New Revision: 267899
URL: http://llvm.org/viewvc/llvm-project?rev=267899&view=rev
Log:
[SLPVectorizer] Extend SLP Vectorizer to deal with aggregates.
The refactoring portion was done as r267748.
http://reviews.llvm.org/D14185
Added:
llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll
Modified:
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=267899&r1=267898&r2=267899&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Apr 28 11:11:45 2016
@@ -278,36 +278,17 @@ static Type* getSameType(ArrayRef<Value
return Ty;
}
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
- assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *VL0 = VL[0];
- ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
- Value *Vec = E0->getOperand(0);
-
- // We have to extract from the same vector type.
- unsigned NElts = Vec->getType()->getVectorNumElements();
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
- if (!CI || CI->getZExtValue())
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
- if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
- return false;
+ return CI && CI->getZExtValue() == Idx;
+ } else {
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
-
- return true;
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -448,6 +429,11 @@ public:
return MinVecRegSize;
}
+ /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
private:
struct TreeEntry;
@@ -457,6 +443,10 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+ bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -1183,8 +1173,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
return;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = CanReuseExtract(VL);
+ bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
@@ -1501,6 +1492,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
}
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ Instruction *E0 = cast<Instruction>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (Opcode == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ if (!matchExtractIndex(E0, 0, Opcode))
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ if (!matchExtractIndex(E, i, Opcode))
+ return false;
+ if (E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1530,11 +1589,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
case Instruction::PHI: {
return 0;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- if (CanReuseExtract(VL)) {
+ if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
@@ -2223,13 +2283,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
}
case Instruction::ExtractElement: {
- if (CanReuseExtract(E->Scalars)) {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
return Gather(E->Scalars, VecTy);
}
+ case Instruction::ExtractValue: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ E->VectorizedValue = V;
+ return propagateMetadata(V, E->Scalars);
+ }
+ return Gather(E->Scalars, VecTy);
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3807,13 +3879,14 @@ bool SLPVectorizer::tryToVectorizeList(A
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
- InsertElementInst *IE = cast<InsertElementInst>(V);
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
- IE->setOperand(1, Extract);
- IE->removeFromParent();
- IE->insertAfter(Extract);
- InsertAfter = IE;
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
}
}
// Move to the next bundle.
@@ -4210,6 +4283,25 @@ static bool findBuildVector(InsertElemen
return false;
}
+/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!IV->hasOneUse())
+ return false;
+ Value *V = IV->getAggregateOperand();
+ if (!isa<UndefValue>(V)) {
+ InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+ if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ return false;
+ }
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ return true;
+}
+
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
@@ -4462,6 +4554,28 @@ bool SLPVectorizer::vectorizeChainsInBlo
continue;
}
+
+ // Try to vectorize trees that start at insertvalue instructions feeding into
+ // a store.
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+ continue;
+
+ DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+ }
+ }
}
return Changed;
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll?rev=267899&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll Thu Apr 28 11:11:45 2016
@@ -0,0 +1,189 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+; CHECK-LABEL: julia_2xdouble
+; CHECK: load <2 x double>
+; CHECK: load <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+ %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+ %x0 = load double, double* %px0, align 4
+ %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+ %y0 = load double, double* %py0, align 4
+ %m0 = fmul double %x0, %y0
+ %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+ %x1 = load double, double* %px1, align 4
+ %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+ %y1 = load double, double* %py1, align 4
+ %m1 = fmul double %x1, %y1
+ %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+ %z0 = load double, double* %pz0, align 4
+ %a0 = fadd double %m0, %z0
+ %i0 = insertvalue [2 x double] undef, double %a0, 0
+ %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+ %z1 = load double, double* %pz1, align 4
+ %a1 = fadd double %m1, %z1
+ %i1 = insertvalue [2 x double] %i0, double %a1, 1
+ store [2 x double] %i1, [2 x double]* %0, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_4xfloat
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fmul <4 x float>
+; CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+ %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+ %x0 = load float, float* %px0, align 4
+ %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+ %y0 = load float, float* %py0, align 4
+ %m0 = fmul float %x0, %y0
+ %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+ %x1 = load float, float* %px1, align 4
+ %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+ %y1 = load float, float* %py1, align 4
+ %m1 = fmul float %x1, %y1
+ %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+ %x2 = load float, float* %px2, align 4
+ %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+ %y2 = load float, float* %py2, align 4
+ %m2 = fmul float %x2, %y2
+ %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+ %x3 = load float, float* %px3, align 4
+ %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+ %y3 = load float, float* %py3, align 4
+ %m3 = fmul float %x3, %y3
+ %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+ %z0 = load float, float* %pz0, align 4
+ %a0 = fadd float %m0, %z0
+ %i0 = insertvalue [4 x float] undef, float %a0, 0
+ %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+ %z1 = load float, float* %pz1, align 4
+ %a1 = fadd float %m1, %z1
+ %i1 = insertvalue [4 x float] %i0, float %a1, 1
+ %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+ %z2 = load float, float* %pz2, align 4
+ %a2 = fadd float %m2, %z2
+ %i2 = insertvalue [4 x float] %i1, float %a2, 2
+ %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+ %z3 = load float, float* %pz3, align 4
+ %a3 = fadd float %m3, %z3
+ %i3 = insertvalue [4 x float] %i2, float %a3, 3
+ store [4 x float] %i3, [4 x float]* %0, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_float
+; CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+ %a_arr = load [4 x float], [4 x float]* %a, align 4
+ %a0 = extractvalue [4 x float] %a_arr, 0
+ %a2 = extractvalue [4 x float] %a_arr, 2
+ %a1 = extractvalue [4 x float] %a_arr, 1
+ %b_arr = load [4 x float], [4 x float]* %b, align 4
+ %b0 = extractvalue [4 x float] %b_arr, 0
+ %b2 = extractvalue [4 x float] %b_arr, 2
+ %b1 = extractvalue [4 x float] %b_arr, 1
+ %a3 = extractvalue [4 x float] %a_arr, 3
+ %c1 = fsub float %a1, %b1
+ %b3 = extractvalue [4 x float] %b_arr, 3
+ %c0 = fsub float %a0, %b0
+ %c2 = fsub float %a2, %b2
+ %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+ %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+ %c3 = fsub float %a3, %b3
+ %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+ %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+ store [4 x float] %c_arr3, [4 x float]* %c, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_i32
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: sub <4 x i32>
+define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+top:
+ %a_arr = load [4 x i32], [4 x i32]* %a, align 4
+ %a0 = extractvalue [4 x i32] %a_arr, 0
+ %a2 = extractvalue [4 x i32] %a_arr, 2
+ %a1 = extractvalue [4 x i32] %a_arr, 1
+ %b_arr = load [4 x i32], [4 x i32]* %b, align 4
+ %b0 = extractvalue [4 x i32] %b_arr, 0
+ %b2 = extractvalue [4 x i32] %b_arr, 2
+ %b1 = extractvalue [4 x i32] %b_arr, 1
+ %a3 = extractvalue [4 x i32] %a_arr, 3
+ %c1 = sub i32 %a1, %b1
+ %b3 = extractvalue [4 x i32] %b_arr, 3
+ %c0 = sub i32 %a0, %b0
+ %c2 = sub i32 %a2, %b2
+ %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
+ %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
+ %c3 = sub i32 %a3, %b3
+ %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
+ %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
+ store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
+ ret void
+}
+
+; Almost identical to previous test, but for type that should NOT be vectorized.
+;
+; CHECK-LABEL: julia_load_array_of_i16
+; CHECK-NOT: i2>
+define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+top:
+ %a_arr = load [4 x i16], [4 x i16]* %a, align 4
+ %a0 = extractvalue [4 x i16] %a_arr, 0
+ %a2 = extractvalue [4 x i16] %a_arr, 2
+ %a1 = extractvalue [4 x i16] %a_arr, 1
+ %b_arr = load [4 x i16], [4 x i16]* %b, align 4
+ %b0 = extractvalue [4 x i16] %b_arr, 0
+ %b2 = extractvalue [4 x i16] %b_arr, 2
+ %b1 = extractvalue [4 x i16] %b_arr, 1
+ %a3 = extractvalue [4 x i16] %a_arr, 3
+ %c1 = sub i16 %a1, %b1
+ %b3 = extractvalue [4 x i16] %b_arr, 3
+ %c0 = sub i16 %a0, %b0
+ %c2 = sub i16 %a2, %b2
+ %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
+ %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
+ %c3 = sub i16 %a3, %b3
+ %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
+ %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
+ store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
+ ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-LABEL: julia_load_struct_of_float
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+ %a_struct = load %pseudovec, %pseudovec* %a, align 4
+ %a0 = extractvalue %pseudovec %a_struct, 0
+ %a1 = extractvalue %pseudovec %a_struct, 1
+ %b_struct = load %pseudovec, %pseudovec* %b, align 4
+ %a2 = extractvalue %pseudovec %a_struct, 2
+ %b0 = extractvalue %pseudovec %b_struct, 0
+ %a3 = extractvalue %pseudovec %a_struct, 3
+ %c0 = fsub float %a0, %b0
+ %b1 = extractvalue %pseudovec %b_struct, 1
+ %b2 = extractvalue %pseudovec %b_struct, 2
+ %c1 = fsub float %a1, %b1
+ %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+ %b3 = extractvalue %pseudovec %b_struct, 3
+ %c3 = fsub float %a3, %b3
+ %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+ %c2 = fsub float %a2, %b2
+ %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+ %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+ store %pseudovec %c_struct3, %pseudovec* %c, align 4
+ ret void
+}
More information about the llvm-commits
mailing list