[llvm] 86dfbc6 - [SLP] add another bailout for load-combine patterns

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Tue May 5 09:46:21 PDT 2020


Author: Sanjay Patel
Date: 2020-05-05T12:44:38-04:00
New Revision: 86dfbc676ebe4f46f9cff629f0fe21fa65bd0f97

URL: https://github.com/llvm/llvm-project/commit/86dfbc676ebe4f46f9cff629f0fe21fa65bd0f97
DIFF: https://github.com/llvm/llvm-project/commit/86dfbc676ebe4f46f9cff629f0fe21fa65bd0f97.diff

LOG: [SLP] add another bailout for load-combine patterns

This builds on the or-reduction bailout that was added with D67841.
We still do not have IR-level load combining, although that could
be a target-specific enhancement for the -vector-combine pass.

The heuristic is narrowly defined to catch the motivating case from
PR39538:
https://bugs.llvm.org/show_bug.cgi?id=39538
...while preserving existing functionality.
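
For reference, here is a reduced IR sketch of the per-store shape the
new bailout looks for: zero-extended byte loads merged with shl/or
into a single wide store that the backend can turn into one
(byte-swapped) load. The function name, types, and offsets are
illustrative, not copied from the actual test:

  define void @bswap_store_sketch(i8* %src, i32* %dst) {
    ; four adjacent byte loads...
    %p1 = getelementptr inbounds i8, i8* %src, i64 1
    %p2 = getelementptr inbounds i8, i8* %src, i64 2
    %p3 = getelementptr inbounds i8, i8* %src, i64 3
    %b0 = load i8, i8* %src, align 1
    %b1 = load i8, i8* %p1, align 1
    %b2 = load i8, i8* %p2, align 1
    %b3 = load i8, i8* %p3, align 1
    ; ...zero-extended, shifted into position, and or'd together...
    %z0 = zext i8 %b0 to i32
    %z1 = zext i8 %b1 to i32
    %z2 = zext i8 %b2 to i32
    %z3 = zext i8 %b3 to i32
    %s0 = shl nuw i32 %z0, 24
    %s1 = shl nuw nsw i32 %z1, 16
    %s2 = shl nuw nsw i32 %z2, 8
    %o01 = or i32 %s1, %s0
    %o012 = or i32 %o01, %s2
    %val = or i32 %o012, %z3
    ; ...into one wide store (a big-endian i32 load in disguise)
    store i32 %val, i32* %dst, align 4
    ret void
  }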

Existing functionality is preserved: there is a test of a pure
load/zext/store sequence at
llvm/test/Transforms/SLPVectorizer/X86/cast.ll that is left unmodified
by this patch (which is why it does not appear in the diff). That is
the reason the logic here requires the 'or' instructions: a plain
store-of-zext-of-load chain does not trigger the bailout. The chances
that vectorization actually helps a memory-bound sequence like that
seem small, but the vectorized form looks nicer:

  vpmovzxwd	(%rsi), %xmm0
  vmovdqu	%xmm0, (%rdi)

rather than:

  movzwl	(%rsi), %eax
  movl	%eax, (%rdi)
  ...
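
In IR terms, that pure load/zext/store shape has no 'or' above the
extended loads, so the new bailout does not fire and SLP can still
vectorize it. A reduced sketch under the same illustrative-naming
caveat as above (this is not the actual cast.ll test):

  define void @load_zext_store_sketch(i16* %src, i32* %dst) {
    ; four adjacent i16 loads, widened to i32 and stored as-is;
    ; there is no 'or'/'shl', so the new isLoadCombineCandidate()
    ; check returns false and vectorization proceeds
    %ps1 = getelementptr inbounds i16, i16* %src, i64 1
    %ps2 = getelementptr inbounds i16, i16* %src, i64 2
    %ps3 = getelementptr inbounds i16, i16* %src, i64 3
    %pd1 = getelementptr inbounds i32, i32* %dst, i64 1
    %pd2 = getelementptr inbounds i32, i32* %dst, i64 2
    %pd3 = getelementptr inbounds i32, i32* %dst, i64 3
    %l0 = load i16, i16* %src, align 2
    %l1 = load i16, i16* %ps1, align 2
    %l2 = load i16, i16* %ps2, align 2
    %l3 = load i16, i16* %ps3, align 2
    %z0 = zext i16 %l0 to i32
    %z1 = zext i16 %l1 to i32
    %z2 = zext i16 %l2 to i32
    %z3 = zext i16 %l3 to i32
    store i32 %z0, i32* %dst, align 4
    store i32 %z1, i32* %pd1, align 4
    store i32 %z2, i32* %pd2, align 4
    store i32 %z3, i32* %pd3, align 4
    ret void
  }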

In the motivating test, we avoid creating a vector mess that is
unrecoverable in the backend. Without this patch, the partially
vectorized code lowers to:

  movzbl (%rdi), %eax
  vmovd %eax, %xmm0
  movzbl 1(%rdi), %eax
  vmovd %eax, %xmm1
  movzbl 2(%rdi), %eax
  vpinsrb $4, 4(%rdi), %xmm0, %xmm0
  vpinsrb $8, 8(%rdi), %xmm0, %xmm0
  vpinsrb $12, 12(%rdi), %xmm0, %xmm0
  vmovd %eax, %xmm2
  movzbl 3(%rdi), %eax
  vpinsrb $1, 5(%rdi), %xmm1, %xmm1
  vpinsrb $2, 9(%rdi), %xmm1, %xmm1
  vpinsrb $3, 13(%rdi), %xmm1, %xmm1
  vpslld $24, %xmm0, %xmm0
  vpmovzxbd %xmm1, %xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
  vpslld $16, %xmm1, %xmm1
  vpor %xmm0, %xmm1, %xmm0
  vpinsrb $1, 6(%rdi), %xmm2, %xmm1
  vmovd %eax, %xmm2
  vpinsrb $2, 10(%rdi), %xmm1, %xmm1
  vpinsrb $3, 14(%rdi), %xmm1, %xmm1
  vpinsrb $1, 7(%rdi), %xmm2, %xmm2
  vpinsrb $2, 11(%rdi), %xmm2, %xmm2
  vpmovzxbd %xmm1, %xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
  vpinsrb $3, 15(%rdi), %xmm2, %xmm2
  vpslld $8, %xmm1, %xmm1
  vpmovzxbd %xmm2, %xmm2 # xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
  vpor %xmm2, %xmm1, %xmm1
  vpor %xmm1, %xmm0, %xmm0
  vmovdqu %xmm0, (%rsi)

With this patch, SDAG load-combines the scalar pattern and forms the
expected bswap (movbe) instructions instead:

  movl	(%rdi), %eax
  movl	4(%rdi), %ecx
  movl	8(%rdi), %edx
  movbel	%eax, (%rsi)
  movbel	%ecx, 4(%rsi)
  movl	12(%rdi), %ecx
  movbel	%edx, 8(%rsi)
  movbel	%ecx, 12(%rsi)

Differential Revision: https://reviews.llvm.org/D78997

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 235efc450e37..008d4002dd83 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -666,6 +666,15 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
 
+  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+  /// can be load combined in the backend. Load combining may not be allowed in
+  /// the IR optimizer, so we do not want to alter the pattern. For example,
+  /// partially transforming a scalar bswap() pattern into vector code is
+  /// effectively impossible for the backend to undo.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  ///       may not be necessary.
+  bool isLoadCombineCandidate() const;
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -3673,8 +3682,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
   return true;
 }
 
-static bool isLoadCombineCandidate(Value *Root, unsigned NumElts,
-                                   TargetTransformInfo *TTI) {
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+                                       TargetTransformInfo *TTI) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
   // shift-left-by-constant.
@@ -3683,9 +3692,9 @@ static bool isLoadCombineCandidate(Value *Root, unsigned NumElts,
          match(ZextLoad, m_Shl(m_Value(), m_Constant())))
     ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
 
-  // Check if the input is an extended load.
+  // Check if the input is an extended load of the required or/shift expression.
   Value *LoadPtr;
-  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+  if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
     return false;
 
   // Require that the total load bit width is a legal integer type.
@@ -3710,7 +3719,20 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
 
   unsigned NumElts = VectorizableTree[0]->Scalars.size();
   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidate(FirstReduced, NumElts, TTI);
+  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+  // Peek through a final sequence of stores and check if all operations are
+  // likely to be load-combined.
+  unsigned NumElts = VectorizableTree[0]->Scalars.size();
+  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+    Value *X;
+    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+        !isLoadCombineCandidateImpl(X, NumElts, TTI))
+      return false;
+  }
+  return true;
 }
 
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
@@ -5758,6 +5780,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   }
   if (R.isTreeTinyAndNotFullyVectorizable())
     return false;
+  if (R.isLoadCombineCandidate())
+    return false;
 
   R.computeMinimumValueSizes();
 
@@ -6010,6 +6034,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       }
       if (R.isTreeTinyAndNotFullyVectorizable())
         continue;
+      if (R.isLoadCombineCandidate())
+        return false;
 
       R.computeMinimumValueSizes();
       int Cost = R.getTreeCost() - UserCost;

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index fb206b84fa93..319d4775c5eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -393,34 +393,50 @@ define void @PR39538(i8* %t0, i32* %t1) {
 ; CHECK-NEXT:    [[T63:%.*]] = load i8, i8* [[T62]], align 1
 ; CHECK-NEXT:    [[T68:%.*]] = load i8, i8* [[T67]], align 1
 ; CHECK-NEXT:    [[T73:%.*]] = load i8, i8* [[T72]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> undef, i8 [[T3]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[T21]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[T40]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[T59]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8> undef, i8 [[T7]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[T25]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[T44]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[T63]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> undef, i8 [[T12]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[T30]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[T49]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[T68]], i32 3
-; CHECK-NEXT:    [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> undef, i8 [[T17]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[T35]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[T54]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[T73]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP24:%.*]] = or <4 x i32> [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = or <4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-NEXT:    [[TMP26:%.*]] = or <4 x i32> [[TMP25]], [[TMP20]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[T1]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP26]], <4 x i32>* [[TMP27]], align 4
+; CHECK-NEXT:    [[T4:%.*]] = zext i8 [[T3]] to i32
+; CHECK-NEXT:    [[T8:%.*]] = zext i8 [[T7]] to i32
+; CHECK-NEXT:    [[T13:%.*]] = zext i8 [[T12]] to i32
+; CHECK-NEXT:    [[T18:%.*]] = zext i8 [[T17]] to i32
+; CHECK-NEXT:    [[T22:%.*]] = zext i8 [[T21]] to i32
+; CHECK-NEXT:    [[T26:%.*]] = zext i8 [[T25]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = zext i8 [[T30]] to i32
+; CHECK-NEXT:    [[T36:%.*]] = zext i8 [[T35]] to i32
+; CHECK-NEXT:    [[T41:%.*]] = zext i8 [[T40]] to i32
+; CHECK-NEXT:    [[T45:%.*]] = zext i8 [[T44]] to i32
+; CHECK-NEXT:    [[T50:%.*]] = zext i8 [[T49]] to i32
+; CHECK-NEXT:    [[T55:%.*]] = zext i8 [[T54]] to i32
+; CHECK-NEXT:    [[T60:%.*]] = zext i8 [[T59]] to i32
+; CHECK-NEXT:    [[T64:%.*]] = zext i8 [[T63]] to i32
+; CHECK-NEXT:    [[T69:%.*]] = zext i8 [[T68]] to i32
+; CHECK-NEXT:    [[T74:%.*]] = zext i8 [[T73]] to i32
+; CHECK-NEXT:    [[T5:%.*]] = shl nuw i32 [[T4]], 24
+; CHECK-NEXT:    [[T23:%.*]] = shl nuw i32 [[T22]], 24
+; CHECK-NEXT:    [[T42:%.*]] = shl nuw i32 [[T41]], 24
+; CHECK-NEXT:    [[T61:%.*]] = shl nuw i32 [[T60]], 24
+; CHECK-NEXT:    [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16
+; CHECK-NEXT:    [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16
+; CHECK-NEXT:    [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16
+; CHECK-NEXT:    [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16
+; CHECK-NEXT:    [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8
+; CHECK-NEXT:    [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8
+; CHECK-NEXT:    [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8
+; CHECK-NEXT:    [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8
+; CHECK-NEXT:    [[T10:%.*]] = or i32 [[T9]], [[T5]]
+; CHECK-NEXT:    [[T15:%.*]] = or i32 [[T10]], [[T14]]
+; CHECK-NEXT:    [[T19:%.*]] = or i32 [[T15]], [[T18]]
+; CHECK-NEXT:    [[T28:%.*]] = or i32 [[T27]], [[T23]]
+; CHECK-NEXT:    [[T33:%.*]] = or i32 [[T28]], [[T32]]
+; CHECK-NEXT:    [[T37:%.*]] = or i32 [[T33]], [[T36]]
+; CHECK-NEXT:    [[T47:%.*]] = or i32 [[T46]], [[T42]]
+; CHECK-NEXT:    [[T52:%.*]] = or i32 [[T47]], [[T51]]
+; CHECK-NEXT:    [[T56:%.*]] = or i32 [[T52]], [[T55]]
+; CHECK-NEXT:    [[T66:%.*]] = or i32 [[T65]], [[T61]]
+; CHECK-NEXT:    [[T71:%.*]] = or i32 [[T66]], [[T70]]
+; CHECK-NEXT:    [[T75:%.*]] = or i32 [[T71]], [[T74]]
+; CHECK-NEXT:    store i32 [[T19]], i32* [[T1]], align 4
+; CHECK-NEXT:    store i32 [[T37]], i32* [[T38]], align 4
+; CHECK-NEXT:    store i32 [[T56]], i32* [[T57]], align 4
+; CHECK-NEXT:    store i32 [[T75]], i32* [[T76]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %t6 = getelementptr inbounds i8, i8* %t0, i64 1

