[llvm] 80cd6b6 - [SLP] Enhance SLPVectorizer to vectorize vector aggregate

Fri Nov 22 09:07:24 PST 2019

Author: Anton Afanasyev
Date: 2019-11-22T20:01:59+03:00
New Revision: 80cd6b6e043f04ff7cb525bc1f09ab1f6400e9db

URL: https://github.com/llvm/llvm-project/commit/80cd6b6e043f04ff7cb525bc1f09ab1f6400e9db
DIFF: https://github.com/llvm/llvm-project/commit/80cd6b6e043f04ff7cb525bc1f09ab1f6400e9db.diff

LOG: [SLP] Enhance SLPVectorizer to vectorize vector aggregate

Summary:
Vector aggregate is homogeneous aggregate of vectors like `{ <2 x float>, <2 x float> }`.
This patch allows `findBuildAggregate()` to consider vector aggregates as
well as scalar ones. For instance, `{ <2 x float>, <2 x float> }` maps to `<4 x float>`.

Fixes vector part of llvm.org/PR42022

Reviewers: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70068

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 57a17e035835..e4f50a8787df 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -630,6 +630,8 @@ class BoUpSLP {
   }
 
   /// Check if ArrayType or StructType is isomorphic to some VectorType.
+  /// Accepts homogeneous aggregate of vectors like
+  /// { <2 x float>, <2 x float> }
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -3096,6 +3098,12 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
     N = cast<ArrayType>(T)->getNumElements();
     EltTy = cast<ArrayType>(T)->getElementType();
   }
+
+  if (auto *VT = dyn_cast<VectorType>(EltTy)) {
+    EltTy = VT->getElementType();
+    N *= VT->getNumElements();
+  }
+
   if (!isValidElementType(EltTy))
     return 0;
   uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
@@ -3104,7 +3112,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   if (ST) {
     // Check that struct is homogeneous.
     for (const auto *Ty : ST->elements())
-      if (Ty != EltTy)
+      if (Ty != *ST->element_begin())
         return 0;
   }
   return N;
@@ -6960,12 +6968,24 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
 }
 
 /// Like findBuildVector, but looks for construction of aggregate.
+/// Accepts homegeneous aggregate of vectors like { <2 x float>, <2 x float> }.
 ///
 /// \return true if it matches.
-static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVectorOpds) {
+static bool findBuildAggregate(InsertValueInst *IV, TargetTransformInfo *TTI,
+                               SmallVectorImpl<Value *> &BuildVectorOpds,
+                               int &UserCost) {
+  UserCost = 0;
   do {
-    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+    if (auto *IE = dyn_cast<InsertElementInst>(IV->getInsertedValueOperand())) {
+      int TmpUserCost;
+      SmallVector<Value *, 4> TmpBuildVectorOpds;
+      if (!findBuildVector(IE, TTI, TmpBuildVectorOpds, TmpUserCost))
+        return false;
+      BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), TmpBuildVectorOpds.rend());
+      UserCost += TmpUserCost;
+    } else {
+      BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+    }
     Value *V = IV->getAggregateOperand();
     if (isa<UndefValue>(V))
       break;
@@ -7138,18 +7158,19 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                  BasicBlock *BB, BoUpSLP &R) {
+  int UserCost = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;
 
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, BuildVectorOpds))
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R);
+  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
index 37602814061a..5124bab7f5d9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
@@ -8,22 +8,19 @@ define { <2 x float>, <2 x float> } @StructOfVectors(float *%Ptr) {
 ; CHECK-LABEL: @StructOfVectors(
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2
 ; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[GEP2]] to <2 x float>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.300000e+01, float 1.400000e+01>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    [[VECIN0:%.*]] = insertelement <2 x float> undef, float [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; CHECK-NEXT:    [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP8]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[VECIN2:%.*]] = insertelement <2 x float> undef, float [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[VECIN3:%.*]] = insertelement <2 x float> [[VECIN2]], float [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[VECIN0:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECIN2:%.*]] = insertelement <2 x float> undef, float [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECIN3:%.*]] = insertelement <2 x float> [[VECIN2]], float [[TMP7]], i64 1
 ; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[VECIN1]], 0
 ; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { <2 x float>, <2 x float> } [[RET0]], <2 x float> [[VECIN3]], 1
 ; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[RET1]]