[llvm] fac7c7e - [SLP] Fix vector element size for the store chains

Mon Dec 14 04:54:01 PST 2020

Author: Anton Afanasyev
Date: 2020-12-14T15:51:43+03:00
New Revision: fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047

URL: https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047
DIFF: https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047.diff

LOG: [SLP] Fix vector element size for the store chains

Vector element size could be different for different store chains.
This patch prevents wrong computation of maximum number of elements
for that case.

Differential Revision: https://reviews.llvm.org/D93192

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c5ba3709f6b1..e1c1c6edf08c 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6076,7 +6076,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
 
     // If a vector register can't hold 1 element, we are done.
     unsigned MaxVecRegSize = R.getMaxVecRegSize();
-    unsigned EltSize = R.getVectorElementSize(Stores[0]);
+    unsigned EltSize = R.getVectorElementSize(Operands[0]);
     if (MaxVecRegSize % EltSize != 0)
       continue;
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
index 63e3178c0278..2fdef624d48f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
@@ -23,28 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) {
 ; CHECK-NEXT:    [[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9
 ; CHECK-NEXT:    [[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10
 ; CHECK-NEXT:    [[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11
-; CHECK-NEXT:    [[T19:%.*]] = load i32, i32* [[T14]], align 4
-; CHECK-NEXT:    [[T23:%.*]] = load i32, i32* [[T18]], align 4
-; CHECK-NEXT:    [[T27:%.*]] = load i32, i32* [[T22]], align 4
-; CHECK-NEXT:    [[T30:%.*]] = load i32, i32* [[T26]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[T20:%.*]] = add nsw i32 [[T19]], 4
-; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], 4
-; CHECK-NEXT:    [[T28:%.*]] = add nsw i32 [[T27]], 6
-; CHECK-NEXT:    [[T31:%.*]] = add nsw i32 [[T30]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], <i64 4, i64 4>
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], <i64 6, i64 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8
-; CHECK-NEXT:    store i32 [[T20]], i32* [[T21]], align 4
-; CHECK-NEXT:    store i32 [[T24]], i32* [[T25]], align 4
-; CHECK-NEXT:    store i32 [[T28]], i32* [[T29]], align 4
-; CHECK-NEXT:    store i32 [[T31]], i32* [[T32]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 4, i32 4, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], <i64 4, i64 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], <i64 6, i64 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %t0 = bitcast i8* %v0 to i32*