[PATCH] D97691: [SLP] Honor min/max regsize and min/max VF in vectorizeStores

Mon Mar 1 07:49:09 PST 2021

bjope created this revision.
bjope added reviewers: spatel, ABataev, anton-afanasyev.
Herald added a subscriber: hiraditya.
bjope requested review of this revision.
Herald added a project: LLVM.

Make sure we use PowerOf2Floor instead of PowerOf2Ceil when
calculating max number of elements that fits inside a vector
register (otherwise we could end up creating vectors larger
than the maximum vector register size).

Also make sure we honor the min/max VF (as given by TTI or
cmd line parameters) when doing vectorizeStores.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D97691

Files:
  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
  llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll


Index: llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll
===================================================================

--- llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll
+++ llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll
@@ -15,15 +15,20 @@
 
 define void @foo() {
 ; CHECK-VF8-160-LABEL: @foo(
-; CHECK-VF8-160-NEXT:    store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1
+; CHECK-VF8-160-NEXT:    store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1
+; CHECK-VF8-160-NEXT:    store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1
 ; CHECK-VF8-160-NEXT:    ret void
 ;
 ; CHECK-VF4-160-LABEL: @foo(
-; CHECK-VF4-160-NEXT:    store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1
+; CHECK-VF4-160-NEXT:    store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1
+; CHECK-VF4-160-NEXT:    store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1
 ; CHECK-VF4-160-NEXT:    ret void
 ;
 ; CHECK-VF2-160-LABEL: @foo(
-; CHECK-VF2-160-NEXT:    store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1
+; CHECK-VF2-160-NEXT:    store <2 x i32> <i32 1, i32 2>, <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1
+; CHECK-VF2-160-NEXT:    store <2 x i32> <i32 3, i32 4>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1
+; CHECK-VF2-160-NEXT:    store <2 x i32> <i32 5, i32 6>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1
+; CHECK-VF2-160-NEXT:    store <2 x i32> <i32 7, i32 8>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1
 ; CHECK-VF2-160-NEXT:    ret void
 ;
 ; CHECK-VF8-128-LABEL: @foo(
@@ -37,8 +42,10 @@
 ; CHECK-VF4-128-NEXT:    ret void
 ;
 ; CHECK-VF2-128-LABEL: @foo(
-; CHECK-VF2-128-NEXT:    store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1
-; CHECK-VF2-128-NEXT:    store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1
+; CHECK-VF2-128-NEXT:    store <2 x i32> <i32 1, i32 2>, <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1
+; CHECK-VF2-128-NEXT:    store <2 x i32> <i32 3, i32 4>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1
+; CHECK-VF2-128-NEXT:    store <2 x i32> <i32 5, i32 6>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1
+; CHECK-VF2-128-NEXT:    store <2 x i32> <i32 7, i32 8>, <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1
 ; CHECK-VF2-128-NEXT:    ret void
 ;
 ; CHECK-VF8-256-LABEL: @foo(
@@ -46,8 +53,14 @@
 ; CHECK-VF8-256-NEXT:    ret void
 ;
 ; CHECK-VF2-128-128-LABEL: @foo(
-; CHECK-VF2-128-128-NEXT:    store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1
-; CHECK-VF2-128-128-NEXT:    store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 1), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 3), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 5), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6), align 1
+; CHECK-VF2-128-128-NEXT:    store i32 8, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 7), align 1
 ; CHECK-VF2-128-128-NEXT:    ret void
 ;
   store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6071,17 +6071,25 @@
       I = ConsecutiveChain[I];
     }
 
-    // If a vector register can't hold 1 element, we are done.
     unsigned MaxVecRegSize = R.getMaxVecRegSize();
     unsigned EltSize = R.getVectorElementSize(Operands[0]);
+    unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
+
+    // For legacy reasons we check that MaxVecRegSize is evenly divisible by the
+    // element size. This check used to be described as checking that at least
+    // one element fits in a vector register, but that did not match the
+    // code. Maybe this check can be removed?
     if (MaxVecRegSize % EltSize != 0)
       continue;
 
-    unsigned MaxElts = MaxVecRegSize / EltSize;
+    unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
+    unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
+                              MaxElts);
+
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
     unsigned StartIdx = 0;
-    for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
+    for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
       for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
         ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
         if (!VectorizedStores.count(Slice.front()) &&


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D97691.327118.patch
Type: text/x-patch
Size: 6486 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210301/1f076834/attachment.bin>