PATCH: WIP SLPVectorize: Enable vectorization of allocas

Tom Stellard tom at stellard.net
Thu Oct 24 21:02:05 PDT 2013


Hi,

As a follow up to this discussion:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-October/066780.html

I put together a very simple patch that begins to implement the transformation
mentioned in the llvm-dev thread.  The patch is incomplete and consists mostly
of comments with questions about how to do certain things, but it does work for
the simple test case included in the patch.

I'd appreciate any feedback people can give me on this patch and the
questions posed in the comments.

Thanks,
Tom
-------------- next part --------------
diff --git lib/Transforms/Vectorize/SLPVectorizer.cpp lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4d82bc4..36662b4 100644
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -360,6 +360,18 @@ public:
 
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
+  /// Transform:
+  /// %0 = getelementptr [n x t]* %ptr, ... , %dynamic_index
+  /// %1 = load t* %0
+  ///
+  /// to
+  ///
+  /// %0 = bitcast [n x t]* %ptr to <n x t>*
+  /// %1 = load <n x t>* %0
+  /// %2 = extractelement <n x t> %1, i32 %dynamic_index
+  bool vectorizeLoadGEP(LoadInst *LI);
+
 private:
   struct TreeEntry;
 
@@ -1041,7 +1053,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
         VectorizableTree.size() << " is fully vectorizable .\n");
-
+  // HACK: Always vectorize Tiny Trees, should we add a TargetTransformInfo
+  // callback?
+  return true;
   // We only handle trees of height 2.
   if (VectorizableTree.size() != 2)
     return false;
@@ -1689,6 +1703,84 @@ void BoUpSLP::optimizeGatherSequence() {
   }
 }
 
+// FIXME: How to determine this?
+static AllocaInst* derivedFromAlloca(Value *Ptr) {
+  return dyn_cast<AllocaInst>(Ptr);
+}
+
+// FIXME: How to determine this?
+static bool hasVectorStride(Type *Ty) {
+  return true;
+}
+
+// FIXME: Should we only introduce vector types that already exist in the
+// program?  If yes, what's the best way to determine which vector types
+// have been used?
+static bool isVectorTypeValid(Type *Ty) {
+  return true;
+}
+
+// FIXME: This function assumes the GEP instruction looks like this:
+// getelementptr [n x t]* %ptr, i32 0, i32 %index
+static Value *calculateVectorIndex(GetElementPtrInst *GEP) {
+  return GEP->getOperand(2);
+}
+
+bool BoUpSLP::vectorizeLoadGEP(LoadInst *LI) {
+
+  // Is the pointer computed by a GEP?
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP)
+    return false;
+
+  // FIXME: For now we only consider:
+  // getelementptr [n x t]*, i32 0, i32 %value
+  if (GEP->getNumOperands() != 3)
+    return false;
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return false;
+  if (dyn_cast<ConstantInt>(GEP->getOperand(2)))
+    return false;
+
+  // Check that the pointer is derived from an alloca
+  AllocaInst *Alloca = derivedFromAlloca(GEP->getPointerOperand());
+  if (!Alloca)
+    return false;
+
+  // Check that the alloca type has a vector stride
+  Type *AllocaTy = Alloca->getAllocatedType();
+  if (!hasVectorStride(AllocaTy))
+    return false;
+
+  // Get the vector type that is equivalent to this array type
+  // FIXME: Instead of translating the type from array to vector, should
+  // we check if the alloca has been bitcast to any vector types
+  // and just use that type?
+  if (!AllocaTy->isArrayTy())
+    return false;
+  VectorType *VectorTy = VectorType::get(AllocaTy->getArrayElementType(),
+                                         AllocaTy->getArrayNumElements());
+
+  // Is it OK to use this vector type with this target?
+  if (!isVectorTypeValid(VectorTy))
+    return false;
+
+  // All of our checks have passed, so we can do the transformation.
+  Builder.SetInsertPoint(GEP->getParent(), GEP);
+  Value *BitCast = Builder.CreateBitCast(Alloca,
+      VectorTy->getPointerTo(Alloca->getType()->getPointerAddressSpace()));
+  Value *VecValue = Builder.CreateLoad(BitCast);
+
+  // Calculate the correct index
+  Value *Index = calculateVectorIndex(GEP);
+  Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+
+  LI->replaceAllUsesWith(ExtractElement);
+
+  return true;
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
@@ -2534,6 +2626,10 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
       continue;
     }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(it)) {
+      Changed |= R.vectorizeLoadGEP(LI);
+    }
   }
 
   return Changed;
diff --git test/Transforms/SLPVectorizer/R600/vectorize-alloca.ll test/Transforms/SLPVectorizer/R600/vectorize-alloca.ll
new file mode 100644
index 0000000..70159eb
--- /dev/null
+++ test/Transforms/SLPVectorizer/R600/vectorize-alloca.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -march=r600 -mcpu=redwood -basicaa -slp-vectorizer -dce -sroa < %s | FileCheck %s
+
+; ModuleID = 'radeon'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; This is a simple case where the slp vectorizer should replace the
+; alloca'd array with a vector and enable sroa to eliminate the
+; alloca instruction.
+
+; CHECK-LABEL: vectorize_simple
+; CHECK-NOT: alloca
+; CHECK: extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+define void @vectorize_simple(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %0 = alloca [4 x i32]
+  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
+  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
+  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
+  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+  store i32 0, i32* %x
+  store i32 1, i32* %y
+  store i32 2, i32* %z
+  store i32 3, i32* %w
+  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
+  %2 = load i32* %1
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}


More information about the llvm-commits mailing list