[llvm] 40f12ef - [SLP] further limit bailout for load combine candidate (PR47450)

Fri Sep 11 08:56:30 PDT 2020

Author: Sanjay Patel
Date: 2020-09-11T11:56:11-04:00
New Revision: 40f12ef621d9fd2fb2dfe24f82b3f4f8c091f4ba

URL: https://github.com/llvm/llvm-project/commit/40f12ef621d9fd2fb2dfe24f82b3f4f8c091f4ba
DIFF: https://github.com/llvm/llvm-project/commit/40f12ef621d9fd2fb2dfe24f82b3f4f8c091f4ba.diff

LOG: [SLP] further limit bailout for load combine candidate (PR47450)

The test example based on PR47450 shows that we can
match non-byte-sized shifts, but those won't ever be
bswap opportunities. This isn't a full fix (we'd still
match if the shifts were by 8-bits for example), but
this should be enough until there's evidence that we
need to do more (this is a borderline case for
vectorization in the first place).

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5ff2cd18c73c..000bd863a7c5 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3694,11 +3694,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                        TargetTransformInfo *TTI) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-constant.
+  // shift-left-by-multiple-of-8-bits.
   Value *ZextLoad = Root;
+  const APInt *ShAmtC;
   while (!isa<ConstantExpr>(ZextLoad) &&
          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
+          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
+           ShAmtC->urem(8) == 0)))
     ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
 
   // Check if the input is an extended load of the required or/shift expression.

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index c78bec1b6a20..e1028cf55276 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -545,10 +545,11 @@ define void @PR47450(i16* nocapture readonly %p) {
 ; CHECK-NEXT:    [[X:%.*]] = load i16, i16* [[P:%.*]], align 2
 ; CHECK-NEXT:    [[Z:%.*]] = zext i16 [[X]] to i32
 ; CHECK-NEXT:    [[S:%.*]] = shl nuw nsw i32 [[Z]], 1
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 0), align 16
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 2), align 8
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[S]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[S]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[S]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @output to <4 x i32>*), align 16
 ; CHECK-NEXT:    ret void
 ;
   %x = load i16, i16* %p, align 2