[llvm] r320548 - [SLP] Vectorize jumbled memory loads.
Sanjoy Das via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 19 15:47:14 PST 2017
Hi,
I think this change is buggy for cases like this:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@array = external global [20 x [13 x float]]
define void @hoge(i64 %idx, <4 x i32>* %sink) {
bb:
%tmp = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 5
%tmp1 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 6
%tmp2 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 7
%tmp3 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 8
%tmp4 = bitcast float* %tmp1 to i32*
%tmp5 = load i32, i32* %tmp4, align 4 ;; index = 6
%tmp6 = insertelement <4 x i32> undef, i32 %tmp5, i32 0
%tmp7 = bitcast float* %tmp2 to i32*
%tmp8 = load i32, i32* %tmp7, align 4 ;; index = 7
%tmp9 = insertelement <4 x i32> %tmp6, i32 %tmp8, i32 1
%tmp10 = bitcast float* %tmp3 to i32*
%tmp11 = load i32, i32* %tmp10, align 4 ;; index = 8
%tmp12 = insertelement <4 x i32> %tmp9, i32 %tmp11, i32 2
%tmp13 = bitcast float* %tmp to i32*
%tmp14 = load i32, i32* %tmp13, align 4 ;; index = 5
%tmp15 = insertelement <4 x i32> %tmp12, i32 %tmp14, i32 3
store <4 x i32> %tmp15, <4 x i32>* %sink ;; Storing [6, 7, 8, 5]
ret void
}
Running `opt -slp-vectorizer` on this IR produces:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@array = external global [20 x [13 x float]]
define void @hoge(i64 %idx, <4 x i32>* %sink) {
bb:
%tmp = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 5
%tmp1 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 6
%tmp2 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 7
%tmp3 = getelementptr inbounds [20 x [13 x float]], [20 x [13 x
float]]* @array, i64 0, i64 %idx, i64 8
%tmp4 = bitcast float* %tmp1 to i32*
%tmp7 = bitcast float* %tmp2 to i32*
%tmp10 = bitcast float* %tmp3 to i32*
%tmp13 = bitcast float* %tmp to i32*
%0 = bitcast i32* %tmp13 to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 4 ;; [5, 6, 7, 8]
%2 = extractelement <4 x i32> %1, i32 0
%tmp6 = insertelement <4 x i32> undef, i32 %2, i32 0
%3 = extractelement <4 x i32> %1, i32 1
%tmp9 = insertelement <4 x i32> %tmp6, i32 %3, i32 1
%4 = extractelement <4 x i32> %1, i32 2
%tmp12 = insertelement <4 x i32> %tmp9, i32 %4, i32 2
%5 = extractelement <4 x i32> %1, i32 3
%tmp15 = insertelement <4 x i32> %tmp12, i32 %5, i32 3
store <4 x i32> %tmp15, <4 x i32>* %sink ;; Storing [5, 6, 7, 8]
ret void
}
which looks incorrect: the final store now writes the loaded values in order [5, 6, 7, 8], whereas the original program stored them in order [6, 7, 8, 5].
The problem is more obvious with this assert:
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp
b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 76ba62f5d59..1978169f909 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -766,6 +766,10 @@ private:
if (UserTreeIdx != -1)
UserTreeEntry = &VectorizableTree[UserTreeIdx];
+ if (!ShuffleMask.empty()) {
+ assert(UserTreeEntry);
+ }
+
if (UserTreeEntry && !ShuffleMask.empty()) {
if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
That is, the SLP vectorizer "forgets" the shuffle mask when the shuffled
load instruction does not have a UserTreeIdx.
If my analysis is correct, can you please revert for now?
The commit also has some stylistic issues; I'll note those in the
Phabricator review at https://reviews.llvm.org/D36130.
-- Sanjoy
More information about the llvm-commits
mailing list