[llvm] r267899 - [SLPVectorizer] Extend SLP Vectorizer to deal with aggregates.
Arch D. Robison via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 28 09:11:46 PDT 2016
Author: adrobiso
Date: Thu Apr 28 11:11:45 2016
New Revision: 267899
URL: http://llvm.org/viewvc/llvm-project?rev=267899&view=rev
Log:
[SLPVectorizer] Extend SLP Vectorizer to deal with aggregates.
The refactoring portion was done as r267748.
http://reviews.llvm.org/D14185
Added:
llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll
Modified:
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=267899&r1=267898&r2=267899&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Apr 28 11:11:45 2016
@@ -278,36 +278,17 @@ static Type* getSameType(ArrayRef<Value
return Ty;
}
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
- assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *VL0 = VL[0];
- ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
- Value *Vec = E0->getOperand(0);
-
- // We have to extract from the same vector type.
- unsigned NElts = Vec->getType()->getVectorNumElements();
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
- if (!CI || CI->getZExtValue())
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
- if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
- return false;
+ return CI && CI->getZExtValue() == Idx;
+ } else {
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
-
- return true;
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -448,6 +429,11 @@ public:
return MinVecRegSize;
}
+ /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
private:
struct TreeEntry;
@@ -457,6 +443,10 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+ bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -1183,8 +1173,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
return;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = CanReuseExtract(VL);
+ bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
@@ -1501,6 +1492,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
}
}
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ Instruction *E0 = cast<Instruction>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (Opcode == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ if (!matchExtractIndex(E0, 0, Opcode))
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ if (!matchExtractIndex(E, i, Opcode))
+ return false;
+ if (E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1530,11 +1589,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
case Instruction::PHI: {
return 0;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- if (CanReuseExtract(VL)) {
+ if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
@@ -2223,13 +2283,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
}
case Instruction::ExtractElement: {
- if (CanReuseExtract(E->Scalars)) {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
return Gather(E->Scalars, VecTy);
}
+ case Instruction::ExtractValue: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ E->VectorizedValue = V;
+ return propagateMetadata(V, E->Scalars);
+ }
+ return Gather(E->Scalars, VecTy);
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3807,13 +3879,14 @@ bool SLPVectorizer::tryToVectorizeList(A
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
- InsertElementInst *IE = cast<InsertElementInst>(V);
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
- IE->setOperand(1, Extract);
- IE->removeFromParent();
- IE->insertAfter(Extract);
- InsertAfter = IE;
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
}
}
// Move to the next bundle.
@@ -4210,6 +4283,25 @@ static bool findBuildVector(InsertElemen
return false;
}
+/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!IV->hasOneUse())
+ return false;
+ Value *V = IV->getAggregateOperand();
+ if (!isa<UndefValue>(V)) {
+ InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+ if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ return false;
+ }
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ return true;
+}
+
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
@@ -4462,6 +4554,28 @@ bool SLPVectorizer::vectorizeChainsInBlo
continue;
}
+
+ // Try to vectorize trees that start at insertvalue instructions feeding into
+ // a store.
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+ continue;
+
+ DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+ }
+ }
}
return Changed;
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll?rev=267899&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/insertvalue.ll Thu Apr 28 11:11:45 2016
@@ -0,0 +1,189 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+; CHECK-LABEL: julia_2xdouble
+; CHECK: load <2 x double>
+; CHECK: load <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+ %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+ %x0 = load double, double* %px0, align 4
+ %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+ %y0 = load double, double* %py0, align 4
+ %m0 = fmul double %x0, %y0
+ %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+ %x1 = load double, double* %px1, align 4
+ %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+ %y1 = load double, double* %py1, align 4
+ %m1 = fmul double %x1, %y1
+ %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+ %z0 = load double, double* %pz0, align 4
+ %a0 = fadd double %m0, %z0
+ %i0 = insertvalue [2 x double] undef, double %a0, 0
+ %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+ %z1 = load double, double* %pz1, align 4
+ %a1 = fadd double %m1, %z1
+ %i1 = insertvalue [2 x double] %i0, double %a1, 1
+ store [2 x double] %i1, [2 x double]* %0, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_4xfloat
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fmul <4 x float>
+; CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+ %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+ %x0 = load float, float* %px0, align 4
+ %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+ %y0 = load float, float* %py0, align 4
+ %m0 = fmul float %x0, %y0
+ %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+ %x1 = load float, float* %px1, align 4
+ %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+ %y1 = load float, float* %py1, align 4
+ %m1 = fmul float %x1, %y1
+ %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+ %x2 = load float, float* %px2, align 4
+ %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+ %y2 = load float, float* %py2, align 4
+ %m2 = fmul float %x2, %y2
+ %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+ %x3 = load float, float* %px3, align 4
+ %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+ %y3 = load float, float* %py3, align 4
+ %m3 = fmul float %x3, %y3
+ %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+ %z0 = load float, float* %pz0, align 4
+ %a0 = fadd float %m0, %z0
+ %i0 = insertvalue [4 x float] undef, float %a0, 0
+ %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+ %z1 = load float, float* %pz1, align 4
+ %a1 = fadd float %m1, %z1
+ %i1 = insertvalue [4 x float] %i0, float %a1, 1
+ %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+ %z2 = load float, float* %pz2, align 4
+ %a2 = fadd float %m2, %z2
+ %i2 = insertvalue [4 x float] %i1, float %a2, 2
+ %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+ %z3 = load float, float* %pz3, align 4
+ %a3 = fadd float %m3, %z3
+ %i3 = insertvalue [4 x float] %i2, float %a3, 3
+ store [4 x float] %i3, [4 x float]* %0, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_float
+; CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+ %a_arr = load [4 x float], [4 x float]* %a, align 4
+ %a0 = extractvalue [4 x float] %a_arr, 0
+ %a2 = extractvalue [4 x float] %a_arr, 2
+ %a1 = extractvalue [4 x float] %a_arr, 1
+ %b_arr = load [4 x float], [4 x float]* %b, align 4
+ %b0 = extractvalue [4 x float] %b_arr, 0
+ %b2 = extractvalue [4 x float] %b_arr, 2
+ %b1 = extractvalue [4 x float] %b_arr, 1
+ %a3 = extractvalue [4 x float] %a_arr, 3
+ %c1 = fsub float %a1, %b1
+ %b3 = extractvalue [4 x float] %b_arr, 3
+ %c0 = fsub float %a0, %b0
+ %c2 = fsub float %a2, %b2
+ %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+ %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+ %c3 = fsub float %a3, %b3
+ %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+ %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+ store [4 x float] %c_arr3, [4 x float]* %c, align 4
+ ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_i32
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: sub <4 x i32>
+define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+top:
+ %a_arr = load [4 x i32], [4 x i32]* %a, align 4
+ %a0 = extractvalue [4 x i32] %a_arr, 0
+ %a2 = extractvalue [4 x i32] %a_arr, 2
+ %a1 = extractvalue [4 x i32] %a_arr, 1
+ %b_arr = load [4 x i32], [4 x i32]* %b, align 4
+ %b0 = extractvalue [4 x i32] %b_arr, 0
+ %b2 = extractvalue [4 x i32] %b_arr, 2
+ %b1 = extractvalue [4 x i32] %b_arr, 1
+ %a3 = extractvalue [4 x i32] %a_arr, 3
+ %c1 = sub i32 %a1, %b1
+ %b3 = extractvalue [4 x i32] %b_arr, 3
+ %c0 = sub i32 %a0, %b0
+ %c2 = sub i32 %a2, %b2
+ %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
+ %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
+ %c3 = sub i32 %a3, %b3
+ %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
+ %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
+ store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
+ ret void
+}
+
+; Almost identical to previous test, but for type that should NOT be vectorized.
+;
+; CHECK-LABEL: julia_load_array_of_i16
+; CHECK-NOT: i2>
+define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+top:
+ %a_arr = load [4 x i16], [4 x i16]* %a, align 4
+ %a0 = extractvalue [4 x i16] %a_arr, 0
+ %a2 = extractvalue [4 x i16] %a_arr, 2
+ %a1 = extractvalue [4 x i16] %a_arr, 1
+ %b_arr = load [4 x i16], [4 x i16]* %b, align 4
+ %b0 = extractvalue [4 x i16] %b_arr, 0
+ %b2 = extractvalue [4 x i16] %b_arr, 2
+ %b1 = extractvalue [4 x i16] %b_arr, 1
+ %a3 = extractvalue [4 x i16] %a_arr, 3
+ %c1 = sub i16 %a1, %b1
+ %b3 = extractvalue [4 x i16] %b_arr, 3
+ %c0 = sub i16 %a0, %b0
+ %c2 = sub i16 %a2, %b2
+ %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
+ %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
+ %c3 = sub i16 %a3, %b3
+ %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
+ %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
+ store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
+ ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-LABEL: julia_load_struct_of_float
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+ %a_struct = load %pseudovec, %pseudovec* %a, align 4
+ %a0 = extractvalue %pseudovec %a_struct, 0
+ %a1 = extractvalue %pseudovec %a_struct, 1
+ %b_struct = load %pseudovec, %pseudovec* %b, align 4
+ %a2 = extractvalue %pseudovec %a_struct, 2
+ %b0 = extractvalue %pseudovec %b_struct, 0
+ %a3 = extractvalue %pseudovec %a_struct, 3
+ %c0 = fsub float %a0, %b0
+ %b1 = extractvalue %pseudovec %b_struct, 1
+ %b2 = extractvalue %pseudovec %b_struct, 2
+ %c1 = fsub float %a1, %b1
+ %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+ %b3 = extractvalue %pseudovec %b_struct, 3
+ %c3 = fsub float %a3, %b3
+ %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+ %c2 = fsub float %a2, %b2
+ %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+ %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+ store %pseudovec %c_struct3, %pseudovec* %c, align 4
+ ret void
+}
More information about the llvm-commits
mailing list