[llvm] [SLP]Do extra analysis int minbitwidth if some checks return false. (PR #84363)

Thu Mar 14 12:58:09 PDT 2024

https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/84363

>From d50a0a177abb6877bdd4cfe8b712f097aed59297 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 7 Mar 2024 19:13:43 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  6 +++---
 .../SLPVectorizer/AArch64/horizontal.ll       | 20 ++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1889bc09e85028..c6502c9f77f001 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13942,7 +13942,7 @@ bool BoUpSLP::collectValuesToDemote(
         !collectValuesToDemote(I->getOperand(1), IsProfitableToDemoteRoot,
                                BitWidth, ToDemote, DemotedConsts, Visited,
                                Level2, IsProfitableToDemote))
-      return false;
+      return IsProfitableToDemote && IsPotentiallyTruncated(I, BitWidth);
     MaxDepthLevel = std::max(Level1, Level2);
     break;
   }
@@ -13958,7 +13958,7 @@ bool BoUpSLP::collectValuesToDemote(
         !collectValuesToDemote(SI->getFalseValue(), IsProfitableToDemoteRoot,
                                BitWidth, ToDemote, DemotedConsts, Visited,
                                Level2, IsProfitableToDemote))
-      return false;
+      return IsProfitableToDemote && IsPotentiallyTruncated(I, BitWidth);
     MaxDepthLevel = std::max(Level1, Level2);
     break;
   }
@@ -13971,7 +13971,7 @@ bool BoUpSLP::collectValuesToDemote(
       if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
                                  ToDemote, DemotedConsts, Visited,
                                  MaxDepthLevel, IsProfitableToDemote))
-        return false;
+        return IsProfitableToDemote && IsPotentiallyTruncated(I, BitWidth);
     break;
   }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 1986b51ec94828..02d1f9f60d0ca1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -228,7 +228,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-36'
+; YAML-NEXT:   - Cost:            '-40'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'
 
@@ -246,15 +246,17 @@ define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noali
 ; CHECK-NEXT:    [[P2_045:%.*]] = phi ptr [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[P1_044:%.*]] = phi ptr [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[P1_044]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[P2_045]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP8]], [[S_047]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <8 x i16> [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i1> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <8 x i16> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <8 x i16> [[TMP8]] to <8 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP10]], [[S_047]]
 ; CHECK-NEXT:    [[CMP83:%.*]] = icmp slt i32 [[OP_RDX]], [[LIM:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       if.end.86: