[llvm] r275471 - [LV] Allow interleaved accesses in loops with predicated blocks

Matthew Simpson via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 14 13:59:47 PDT 2016


Author: mssimpso
Date: Thu Jul 14 15:59:47 2016
New Revision: 275471

URL: http://llvm.org/viewvc/llvm-project?rev=275471&view=rev
Log:
[LV] Allow interleaved accesses in loops with predicated blocks

This patch allows the formation of interleaved access groups in loops
containing predicated blocks. However, the predicated accesses themselves
are still prevented from forming groups.

Differential Revision: https://reviews.llvm.org/D19694
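
For illustration only (the loop below is not part of the patch and its names
are hypothetical), the change targets loops of roughly the following shape,
where the field loads are unconditional but a store sits inside an if-block.
The unpredicated loads may now form an interleaved group, while the
predicated store is kept out of any group (the added
interleaved_with_cond_store_* tests exercise this pattern in IR):

    // Sketch of a loop containing a predicated block. The two field loads
    // are unconditional and are candidates for an interleaved access group;
    // the store guarded by the branch is predicated and is excluded from
    // any group.
    struct pair { long x, y; };

    void update(pair *p, long q, long n) {
      for (long i = 0; i < n; ++i) {
        long a = p[i].x; // strided load, candidate for grouping
        long b = p[i].y; // strided load, candidate for grouping
        if (b == q)
          p[i].y = a;    // predicated store, not added to a group
      }
    }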

Added:
    llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=275471&r1=275470&r2=275471&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Jul 14 15:59:47 2016
@@ -947,6 +947,11 @@ private:
     return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
   }
 
+  /// \brief Returns true if \p BB is a predicated block.
+  bool isPredicated(BasicBlock *BB) const {
+    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+  }
+
   /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
   bool areDependencesValid() const {
     return LAI && LAI->getDepChecker().getDependences();
@@ -4925,53 +4930,38 @@ bool LoopVectorizationLegality::blockCan
 void InterleavedAccessInfo::collectConstStridedAccesses(
     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
     const ValueToValueMap &Strides) {
-  // Holds load/store instructions in program order.
-  SmallVector<Instruction *, 16> AccessList;
+
+  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
 
   // Since it's desired that the load/store instructions be maintained in
   // "program order" for the interleaved access analysis, we have to visit the
   // blocks in the loop in reverse postorder (i.e., in a topological order).
   // Such an ordering will ensure that any load/store that may be executed
-  // before a second load/store will precede the second load/store in the
-  // AccessList.
+  // before a second load/store will precede the second load/store in
+  // StrideAccesses.
   LoopBlocksDFS DFS(TheLoop);
   DFS.perform(LI);
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
     for (auto &I : *BB) {
-      if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+      auto *LI = dyn_cast<LoadInst>(&I);
+      auto *SI = dyn_cast<StoreInst>(&I);
+      if (!LI && !SI)
         continue;
-      // FIXME: Currently we can't handle mixed accesses and predicated accesses
-      if (IsPred)
-        return;
 
-      AccessList.push_back(&I);
-    }
-  }
-
-  if (AccessList.empty())
-    return;
+      Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+      int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
 
-  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
-  for (auto I : AccessList) {
-    auto *LI = dyn_cast<LoadInst>(I);
-    auto *SI = dyn_cast<StoreInst>(I);
-
-    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
-    int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
-
-    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
-    PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
-    uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
-
-    // An alignment of 0 means target ABI alignment.
-    unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
-    if (!Align)
-      Align = DL.getABITypeAlignment(PtrTy->getElementType());
+      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+      uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+
+      // An alignment of 0 means target ABI alignment.
+      unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+      if (!Align)
+        Align = DL.getABITypeAlignment(PtrTy->getElementType());
 
-    StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
-  }
+      StrideAccesses[&I] = StrideDescriptor(Stride, Scev, Size, Align);
+    }
 }
 
 // Analyze interleaved accesses and collect them into interleaved load and
@@ -5125,6 +5115,12 @@ void InterleavedAccessInfo::analyzeInter
       if (DistanceToA % static_cast<int64_t>(DesA.Size))
         continue;
 
+      // If either A or B is in a predicated block, we prevent adding them to a
+      // group. We may be able to relax this limitation in the future once we
+      // handle more complicated blocks.
+      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+        continue;
+
       // The index of B is the index of A plus the related index to A.
       int IndexB =
           Group->getIndex(A) + DistanceToA / static_cast<int64_t>(DesA.Size);

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll?rev=275471&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll Thu Jul 14 15:59:47 2016
@@ -0,0 +1,164 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -vectorize-num-stores-pred=1 -enable-cond-stores-vec < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+%pair = type { i64, i64 }
+
+; Ensure that we vectorize the interleaved load group even though the loop
+; contains a conditional store. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't form a single interleaved group for the two loads. The
+; conditional store prevents the second load from being hoisted. The two load
+; groups are separately vectorized. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+;
+; CHECK: pred.store.continue
+; CHECK:   %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
+; CHECK:   store i64 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
+; CHECK:   store i64 %[[X4]], {{.*}}
+
+define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.0, align 8
+  br label %if.merge
+
+if.merge:
+  %2 = load i64, i64* %p.0, align 8
+  store i64 %2, i64 *%p.1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't create a single interleaved group for the two stores.
+; The second store is conditional and we can't sink the first store inside the
+; predicated block. The load group is vectorized, and the store groups contain
+; gaps and are not vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK:   store i64 %x, {{.*}}
+; CHECK:   store i64 %x, {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  store i64 %x, i64* %p.0, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

More information about the llvm-commits mailing list