[llvm] r320548 - [SLP] Vectorize jumbled memory loads.

Sanjoy Das via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 19 15:47:14 PST 2017


Hi,

I think this change is buggy for cases like this:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@array = external global [20 x [13 x float]]

define void @hoge(i64 %idx, <4 x i32>* %sink) {
bb:
  %tmp = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 5
  %tmp1 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 6
  %tmp2 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 7
  %tmp3 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 8
  %tmp4 = bitcast float* %tmp1 to i32*
  %tmp5 = load i32, i32* %tmp4, align 4  ;; index = 6
  %tmp6 = insertelement <4 x i32> undef, i32 %tmp5, i32 0
  %tmp7 = bitcast float* %tmp2 to i32*
  %tmp8 = load i32, i32* %tmp7, align 4  ;; index = 7
  %tmp9 = insertelement <4 x i32> %tmp6, i32 %tmp8, i32 1
  %tmp10 = bitcast float* %tmp3 to i32*
  %tmp11 = load i32, i32* %tmp10, align 4  ;; index = 8
  %tmp12 = insertelement <4 x i32> %tmp9, i32 %tmp11, i32 2
  %tmp13 = bitcast float* %tmp to i32*
  %tmp14 = load i32, i32* %tmp13, align 4  ;; index = 5
  %tmp15 = insertelement <4 x i32> %tmp12, i32 %tmp14, i32 3
  store <4 x i32> %tmp15, <4 x i32>* %sink  ;; Storing [6, 7, 8, 5]
  ret void
}


Running this through opt -slp-vectorizer results in:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@array = external global [20 x [13 x float]]

define void @hoge(i64 %idx, <4 x i32>* %sink) {
bb:
  %tmp = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 5
  %tmp1 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 6
  %tmp2 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 7
  %tmp3 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x float]]* @array, i64 0, i64 %idx, i64 8
  %tmp4 = bitcast float* %tmp1 to i32*
  %tmp7 = bitcast float* %tmp2 to i32*
  %tmp10 = bitcast float* %tmp3 to i32*
  %tmp13 = bitcast float* %tmp to i32*
  %0 = bitcast i32* %tmp13 to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4 ;; [5, 6, 7, 8]
  %2 = extractelement <4 x i32> %1, i32 0
  %tmp6 = insertelement <4 x i32> undef, i32 %2, i32 0
  %3 = extractelement <4 x i32> %1, i32 1
  %tmp9 = insertelement <4 x i32> %tmp6, i32 %3, i32 1
  %4 = extractelement <4 x i32> %1, i32 2
  %tmp12 = insertelement <4 x i32> %tmp9, i32 %4, i32 2
  %5 = extractelement <4 x i32> %1, i32 3
  %tmp15 = insertelement <4 x i32> %tmp12, i32 %5, i32 3
  store <4 x i32> %tmp15, <4 x i32>* %sink  ;; Storing [5, 6, 7, 8]
  ret void
}

which looks incorrect: the vectorized code now stores [5, 6, 7, 8] to %sink, while the original code stored [6, 7, 8, 5].
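
For comparison, a correct vectorization would have to re-apply the jumbled order after the wide load, e.g. with a shufflevector along these lines (a hand-written sketch of what I'd expect, not actual pass output; %reordered is just an illustrative name):

  %1 = load <4 x i32>, <4 x i32>* %0, align 4   ;; [5, 6, 7, 8]
  %reordered = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>   ;; [6, 7, 8, 5]
  store <4 x i32> %reordered, <4 x i32>* %sink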

The problem is more obvious with this assert:

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 76ba62f5d59..1978169f909 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -766,6 +766,10 @@ private:
     if (UserTreeIdx != -1)
       UserTreeEntry = &VectorizableTree[UserTreeIdx];

+    if (!ShuffleMask.empty()) {
+      assert(UserTreeEntry);
+    }
+
     if (UserTreeEntry && !ShuffleMask.empty()) {
       if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
         UserTreeEntry->ShuffleMask.resize(OpdNum + 1);


In that code, the SLP vectorizer "forgets" the shuffle mask if the shuffled load bundle does not have a UserTreeIdx: when UserTreeIdx is -1, UserTreeEntry stays null, so the computed mask is never recorded and no compensating shuffle is emitted.
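
For reference, the bad output (and, with the assert above applied, an assertion failure) should be reproducible by running opt on the IR at the top of this mail, e.g. (the file name is just an example):

  opt -slp-vectorizer -S repro.ll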

If my analysis is correct, can you please revert for now?

The commit also has some stylistic issues; I'll note them on the https://reviews.llvm.org/D36130 Phabricator review.

-- Sanjoy

