[llvm] 49950cb - [SLP] restrict matching of load combine candidates

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Tue May 11 05:53:49 PDT 2021


Author: Sanjay Patel
Date: 2021-05-11T08:46:40-04:00
New Revision: 49950cb1f6f699cbb9d8f141c0c043d4795c3417

URL: https://github.com/llvm/llvm-project/commit/49950cb1f6f699cbb9d8f141c0c043d4795c3417
DIFF: https://github.com/llvm/llvm-project/commit/49950cb1f6f699cbb9d8f141c0c043d4795c3417.diff

LOG: [SLP] restrict matching of load combine candidates

The test example from https://llvm.org/PR50256 (and reduced here)
shows that we can match a load combine candidate even when there
are no "or" instructions. We can avoid that by confirming that we
do see an "or". This doesn't apply when matching an or-reduction
because that match begins from the operands of the reduction.

Differential Revision: https://reviews.llvm.org/D102074

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e656b189c7793..222062da13fca 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4066,21 +4066,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
 }
 
 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
-                                       TargetTransformInfo *TTI) {
+                                       TargetTransformInfo *TTI,
+                                       bool MustMatchOrInst) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
   // shift-left-by-multiple-of-8-bits.
   Value *ZextLoad = Root;
   const APInt *ShAmtC;
+  bool FoundOr = false;
   while (!isa<ConstantExpr>(ZextLoad) &&
          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
           (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
-           ShAmtC->urem(8) == 0)))
-    ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
-
+           ShAmtC->urem(8) == 0))) {
+    auto *BinOp = cast<BinaryOperator>(ZextLoad);
+    ZextLoad = BinOp->getOperand(0);
+    if (BinOp->getOpcode() == Instruction::Or)
+      FoundOr = true;
+  }
   // Check if the input is an extended load of the required or/shift expression.
   Value *LoadPtr;
-  if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
+      !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
     return false;
 
   // Require that the total load bit width is a legal integer type.
@@ -4105,7 +4111,8 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
 
   unsigned NumElts = VectorizableTree[0]->Scalars.size();
   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
+                                    /* MatchOr */ false);
 }
 
 bool BoUpSLP::isLoadCombineCandidate() const {
@@ -4115,7 +4122,7 @@ bool BoUpSLP::isLoadCombineCandidate() const {
   for (Value *Scalar : VectorizableTree[0]->Scalars) {
     Value *X;
     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
-        !isLoadCombineCandidateImpl(X, NumElts, TTI))
+        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
       return false;
   }
   return true;

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
index 89a01b2fba3b2..3129ba79dafaa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
@@ -4,7 +4,6 @@
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
 
-; FIXME:
 ; This should not be matched as a load combining candidate.
 ; There are no 'or' operations, so it can't be a bswap or
 ; other pattern that we are expecting the backend to handle.
@@ -26,54 +25,14 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 13
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 15
-; CHECK-NEXT:    [[I:%.*]] = load i8, i8* [[A]], align 1
-; CHECK-NEXT:    [[I1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
-; CHECK-NEXT:    [[I2:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
-; CHECK-NEXT:    [[I3:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
-; CHECK-NEXT:    [[I4:%.*]] = load i8, i8* [[ARRAYIDX_4]], align 1
-; CHECK-NEXT:    [[I5:%.*]] = load i8, i8* [[ARRAYIDX_5]], align 1
-; CHECK-NEXT:    [[I6:%.*]] = load i8, i8* [[ARRAYIDX_6]], align 1
-; CHECK-NEXT:    [[I7:%.*]] = load i8, i8* [[ARRAYIDX_7]], align 1
-; CHECK-NEXT:    [[I8:%.*]] = load i8, i8* [[ARRAYIDX_8]], align 1
-; CHECK-NEXT:    [[I9:%.*]] = load i8, i8* [[ARRAYIDX_9]], align 1
-; CHECK-NEXT:    [[I10:%.*]] = load i8, i8* [[ARRAYIDX_10]], align 1
-; CHECK-NEXT:    [[I11:%.*]] = load i8, i8* [[ARRAYIDX_11]], align 1
-; CHECK-NEXT:    [[I12:%.*]] = load i8, i8* [[ARRAYIDX_12]], align 1
-; CHECK-NEXT:    [[I13:%.*]] = load i8, i8* [[ARRAYIDX_13]], align 1
-; CHECK-NEXT:    [[I14:%.*]] = load i8, i8* [[ARRAYIDX_14]], align 1
-; CHECK-NEXT:    [[I15:%.*]] = load i8, i8* [[ARRAYIDX_15]], align 1
-; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[I]] to i16
-; CHECK-NEXT:    [[CONV5_1:%.*]] = zext i8 [[I1]] to i16
-; CHECK-NEXT:    [[CONV5_2:%.*]] = zext i8 [[I2]] to i16
-; CHECK-NEXT:    [[CONV5_3:%.*]] = zext i8 [[I3]] to i16
-; CHECK-NEXT:    [[CONV5_4:%.*]] = zext i8 [[I4]] to i16
-; CHECK-NEXT:    [[CONV5_5:%.*]] = zext i8 [[I5]] to i16
-; CHECK-NEXT:    [[CONV5_6:%.*]] = zext i8 [[I6]] to i16
-; CHECK-NEXT:    [[CONV5_7:%.*]] = zext i8 [[I7]] to i16
-; CHECK-NEXT:    [[CONV5_8:%.*]] = zext i8 [[I8]] to i16
-; CHECK-NEXT:    [[CONV5_9:%.*]] = zext i8 [[I9]] to i16
-; CHECK-NEXT:    [[CONV5_10:%.*]] = zext i8 [[I10]] to i16
-; CHECK-NEXT:    [[CONV5_11:%.*]] = zext i8 [[I11]] to i16
-; CHECK-NEXT:    [[CONV5_12:%.*]] = zext i8 [[I12]] to i16
-; CHECK-NEXT:    [[CONV5_13:%.*]] = zext i8 [[I13]] to i16
-; CHECK-NEXT:    [[CONV5_14:%.*]] = zext i8 [[I14]] to i16
-; CHECK-NEXT:    [[CONV5_15:%.*]] = zext i8 [[I15]] to i16
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i16 [[CONV5]], 8
-; CHECK-NEXT:    [[SHL_1:%.*]] = shl nuw i16 [[CONV5_1]], 8
-; CHECK-NEXT:    [[SHL_2:%.*]] = shl nuw i16 [[CONV5_2]], 8
-; CHECK-NEXT:    [[SHL_3:%.*]] = shl nuw i16 [[CONV5_3]], 8
-; CHECK-NEXT:    [[SHL_4:%.*]] = shl nuw i16 [[CONV5_4]], 8
-; CHECK-NEXT:    [[SHL_5:%.*]] = shl nuw i16 [[CONV5_5]], 8
-; CHECK-NEXT:    [[SHL_6:%.*]] = shl nuw i16 [[CONV5_6]], 8
-; CHECK-NEXT:    [[SHL_7:%.*]] = shl nuw i16 [[CONV5_7]], 8
-; CHECK-NEXT:    [[SHL_8:%.*]] = shl nuw i16 [[CONV5_8]], 8
-; CHECK-NEXT:    [[SHL_9:%.*]] = shl nuw i16 [[CONV5_9]], 8
-; CHECK-NEXT:    [[SHL_10:%.*]] = shl nuw i16 [[CONV5_10]], 8
-; CHECK-NEXT:    [[SHL_11:%.*]] = shl nuw i16 [[CONV5_11]], 8
-; CHECK-NEXT:    [[SHL_12:%.*]] = shl nuw i16 [[CONV5_12]], 8
-; CHECK-NEXT:    [[SHL_13:%.*]] = shl nuw i16 [[CONV5_13]], 8
-; CHECK-NEXT:    [[SHL_14:%.*]] = shl nuw i16 [[CONV5_14]], 8
-; CHECK-NEXT:    [[SHL_15:%.*]] = shl nuw i16 [[CONV5_15]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw <8 x i16> [[TMP5]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw <8 x i16> [[TMP6]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3
@@ -89,22 +48,10 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX3_13:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 13
 ; CHECK-NEXT:    [[ARRAYIDX3_14:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX3_15:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 15
-; CHECK-NEXT:    store i16 [[SHL]], i16* [[B]], align 2
-; CHECK-NEXT:    store i16 [[SHL_1]], i16* [[ARRAYIDX3_1]], align 2
-; CHECK-NEXT:    store i16 [[SHL_2]], i16* [[ARRAYIDX3_2]], align 2
-; CHECK-NEXT:    store i16 [[SHL_3]], i16* [[ARRAYIDX3_3]], align 2
-; CHECK-NEXT:    store i16 [[SHL_4]], i16* [[ARRAYIDX3_4]], align 2
-; CHECK-NEXT:    store i16 [[SHL_5]], i16* [[ARRAYIDX3_5]], align 2
-; CHECK-NEXT:    store i16 [[SHL_6]], i16* [[ARRAYIDX3_6]], align 2
-; CHECK-NEXT:    store i16 [[SHL_7]], i16* [[ARRAYIDX3_7]], align 2
-; CHECK-NEXT:    store i16 [[SHL_8]], i16* [[ARRAYIDX3_8]], align 2
-; CHECK-NEXT:    store i16 [[SHL_9]], i16* [[ARRAYIDX3_9]], align 2
-; CHECK-NEXT:    store i16 [[SHL_10]], i16* [[ARRAYIDX3_10]], align 2
-; CHECK-NEXT:    store i16 [[SHL_11]], i16* [[ARRAYIDX3_11]], align 2
-; CHECK-NEXT:    store i16 [[SHL_12]], i16* [[ARRAYIDX3_12]], align 2
-; CHECK-NEXT:    store i16 [[SHL_13]], i16* [[ARRAYIDX3_13]], align 2
-; CHECK-NEXT:    store i16 [[SHL_14]], i16* [[ARRAYIDX3_14]], align 2
-; CHECK-NEXT:    store i16 [[SHL_15]], i16* [[ARRAYIDX3_15]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16* [[B]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP7]], <8 x i16>* [[TMP9]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3_8]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i64 1


        


More information about the llvm-commits mailing list