[llvm] r294027 - [SLP] Use SCEV to sort memory accesses.

Michael Kuperstein via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 3 11:09:45 PST 2017


Author: mkuper
Date: Fri Feb  3 13:09:45 2017
New Revision: 294027

URL: http://llvm.org/viewvc/llvm-project?rev=294027&view=rev
Log:
[SLP] Use SCEV to sort memory accesses.

This generalizes memory access sorting to use differences between SCEVs,
instead of relying on constant offsets. That allows us to properly do
SLP vectorization of non-sequentially ordered loads within loop bodies.

Differential Revision: https://reviews.llvm.org/D29425
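
For reference, the heart of the new sorting criterion can be sketched as
follows. This is a minimal illustration only, not code from the patch; the
helper name and its exact shape are invented, and it assumes the caller
already has a ScalarEvolution instance for the enclosing function:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  // Returns true and sets ByteOffset if the distance between PtrB and PtrA
  // is a compile-time constant according to SCEV, even when neither pointer
  // has a constant offset from the base of the object (e.g. b[i] vs. b[i+1]
  // inside a loop).
  static bool getConstantDistance(ScalarEvolution &SE, Value *PtrA,
                                  Value *PtrB, int64_t &ByteOffset) {
    const SCEV *A = SE.getSCEV(PtrA);
    const SCEV *B = SE.getSCEV(PtrB);
    // getMinusSCEV forms the symbolic difference B - A; if SCEV can fold it
    // to a constant, the two accesses have a fixed relative offset and can
    // be ordered by it.
    if (const auto *C = dyn_cast<SCEVConstant>(SE.getMinusSCEV(B, A))) {
      ByteOffset = C->getAPInt().getSExtValue();
      return true;
    }
    return false;
  }

Previously the sort only looked at constant offsets accumulated through
stripAndAccumulateInBoundsConstantOffsets, so b[i] and b[i+1] with a
loop-variant i could not be ordered; their SCEV difference, however, is
simply the constant 4 (for i32 elements).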

Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=294027&r1=294026&r2=294027&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Fri Feb  3 13:09:45 2017
@@ -690,8 +690,14 @@ int64_t getPtrStride(PredicatedScalarEvo
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
-/// \brief Saves the sorted memory accesses in vector argument 'Sorted' after
-/// sorting the jumbled memory accesses.
+/// \brief Try to sort an array of loads / stores.
+///
+/// If all pointers refer to the same object, and the differences between all
+/// pointer operands are known to be constant, the array is sorted by offset,
+/// and returned in \p Sorted.
+/// 
+/// If those conditions do not hold, the output array is an arbitrary
+/// permutation of the input.
 void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                      ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
 

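For context, a hypothetical call site for the updated interface might look
like the sketch below. Only the declaration above comes from the patch; the
wrapper function and its parameters are invented for illustration:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/LoopAccessAnalysis.h"
  using namespace llvm;

  // Ask for a sorted copy of a bundle of loads/stores. If all pointers share
  // an underlying object and their mutual offsets are constant, Sorted holds
  // the accesses in increasing offset order; otherwise it is just some
  // permutation of VL, so callers must re-verify any ordering they need.
  void sortBundle(ArrayRef<Value *> VL, const DataLayout &DL,
                  ScalarEvolution &SE) {
    SmallVector<Value *, 4> Sorted;
    sortMemAccesses(VL, DL, SE, Sorted);
    // ... e.g. check that consecutive elements of Sorted are one element
    // apart before emitting a vector load or store.
  }
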
Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=294027&r1=294026&r2=294027&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Fri Feb  3 13:09:45 2017
@@ -1058,30 +1058,47 @@ static unsigned getAddressSpaceOperand(V
   return -1;
 }
 
-/// Saves the memory accesses after sorting it into vector argument 'Sorted'.
 void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                          ScalarEvolution &SE,
                          SmallVectorImpl<Value *> &Sorted) {
-  SmallVector<std::pair<int, Value *>, 4> OffValPairs;
+  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+  OffValPairs.reserve(VL.size());
+  Sorted.reserve(VL.size());
+
+  // Walk over the pointers, and map each of them to an offset relative to
+  // the first pointer in the array.
+  Value *Ptr0 = getPointerOperand(VL[0]);
+  const SCEV *Scev0 = SE.getSCEV(Ptr0);
+  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
+
   for (auto *Val : VL) {
-    // Compute the constant offset from the base pointer of each memory accesses
-    // and insert into the vector of key,value pair which needs to be sorted.
     Value *Ptr = getPointerOperand(Val);
-    unsigned AS = getAddressSpaceOperand(Val);
-    unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
-    Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
-
-    // FIXME: Currently the offsets are assumed to be constant.However this not
-    // always true as offsets can be variables also and we would need to
-    // consider the difference of the variable offsets.
-    APInt Offset(PtrBitWidth, 0);
-    Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
-    OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
+
+    // If a pointer refers to a different underlying object, bail - the
+    // pointers are by definition incomparable.
+    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
+    if (CurrObj != Obj0) {
+      Sorted.append(VL.begin(), VL.end());
+      return;
+    }
+
+    const SCEVConstant *Diff =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0));
+
+    // The pointers may not have a constant offset from each other, or SCEV
+    // may just not be smart enough to figure out they do. Regardless,
+    // there's nothing we can do.
+    if (!Diff) {
+      Sorted.append(VL.begin(), VL.end());
+      return;
+    }
+
+    OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
   }
+
   std::sort(OffValPairs.begin(), OffValPairs.end(),
-            [](const std::pair<int, Value *> &Left,
-               const std::pair<int, Value *> &Right) {
+            [](const std::pair<int64_t, Value *> &Left,
+               const std::pair<int64_t, Value *> &Right) {
               return Left.first < Right.first;
             });
 

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll?rev=294027&r1=294026&r2=294027&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll Fri Feb  3 13:09:45 2017
@@ -1,18 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
 
-
+@total = common global i32 0, align 4
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
@@ -20,10 +20,10 @@ define i32 @jumbled-load(i32* noalias no
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
@@ -59,3 +59,116 @@ define i32 @jumbled-load(i32* noalias no
 
   ret i32 undef
 }
+
+; Make sure we can sort loads even if they have non-constant offsets, as long as
+; the offset *differences* are constant and computable by SCEV.
+define void @scev(i64 %N, i32* nocapture readonly %b, i32* nocapture readonly %c) {
+; CHECK-LABEL: @scev(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_OUTER:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP_OUTER]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_P:%.*]] = phi i64 [ [[ADD21:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ [[TMP14:%.*]], [[FOR_BODY]] ], [ zeroinitializer, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I_P]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[I_P]]
+; CHECK-NEXT:    [[ADD3:%.*]] = or i64 [[I_P]], 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD3]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD3]]
+; CHECK-NEXT:    [[ADD9:%.*]] = or i64 [[I_P]], 2
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD9]]
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD9]]
+; CHECK-NEXT:    [[ADD15:%.*]] = or i64 [[I_P]], 3
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    [[TMP14]] = add <4 x i32> [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[ADD21]] = add nuw nsw i64 [[I_P]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[ADD21]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP14]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP15]], i32 1
+; CHECK-NEXT:    [[ADD22:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP15]], i32 2
+; CHECK-NEXT:    [[ADD23:%.*]] = add nsw i32 [[ADD22]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP15]], i32 3
+; CHECK-NEXT:    [[ADD24:%.*]] = add nsw i32 [[ADD23]], [[TMP19]]
+; CHECK-NEXT:    store i32 [[ADD24]], i32* @total, align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp.outer = icmp sgt i64 %N, 0
+  br i1 %cmp.outer, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %a4.p = phi i32 [ %add20, %for.body ], [ 0, %for.body.preheader ]
+  %a3.p = phi i32 [ %add2, %for.body ], [ 0, %for.body.preheader ]
+  %a2.p = phi i32 [ %add8, %for.body ], [ 0, %for.body.preheader ]
+  %a1.p = phi i32 [ %add14, %for.body ], [ 0, %for.body.preheader ]
+  %i.p = phi i64 [ %add21, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.p
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.p
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add i32 %0, %a3.p
+  %add2 = add i32 %add, %1
+  %add3 = or i64 %i.p, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %b, i64 %add3
+  %2 = load i32, i32* %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %add3
+  %3 = load i32, i32* %arrayidx6, align 4
+  %add7 = add i32 %2, %a2.p
+  %add8 = add i32 %add7, %3
+  %add9 = or i64 %i.p, 2
+  %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %add9
+  %4 = load i32, i32* %arrayidx10, align 4
+  %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %add9
+  %5 = load i32, i32* %arrayidx12, align 4
+  %add13 = add i32 %4, %a1.p
+  %add14 = add i32 %add13, %5
+  %add15 = or i64 %i.p, 3
+  %arrayidx16 = getelementptr inbounds i32, i32* %b, i64 %add15
+  %6 = load i32, i32* %arrayidx16, align 4
+  %arrayidx18 = getelementptr inbounds i32, i32* %c, i64 %add15
+  %7 = load i32, i32* %arrayidx18, align 4
+  %add19 = add i32 %6, %a4.p
+  %add20 = add i32 %add19, %7
+  %add21 = add nuw nsw i64 %i.p, 4
+  %cmp = icmp slt i64 %add21, %N
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %a1.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.end.loopexit ]
+  %a2.0.lcssa = phi i32 [ 0, %entry ], [ %add8, %for.end.loopexit ]
+  %a3.0.lcssa = phi i32 [ 0, %entry ], [ %add2, %for.end.loopexit ]
+  %a4.0.lcssa = phi i32 [ 0, %entry ], [ %add20, %for.end.loopexit ]
+  %add22 = add nsw i32 %a2.0.lcssa, %a1.0.lcssa
+  %add23 = add nsw i32 %add22, %a3.0.lcssa
+  %add24 = add nsw i32 %add23, %a4.0.lcssa
+  store i32 %add24, i32* @total, align 4
+  ret void
+}



