[llvm] 6d3d5f3 - [SLP][REVEC] getWidenedType should be used instead of FixedVectorType::get. (#109843)

Tue Sep 24 11:23:17 PDT 2024

Author: Han-Kuan Chen
Date: 2024-09-25T02:23:14+08:00
New Revision: 6d3d5f30bd58af6d16d0b4b7d32dc3ead1e098ec

URL: https://github.com/llvm/llvm-project/commit/6d3d5f30bd58af6d16d0b4b7d32dc3ead1e098ec
DIFF: https://github.com/llvm/llvm-project/commit/6d3d5f30bd58af6d16d0b4b7d32dc3ead1e098ec.diff

LOG: [SLP][REVEC] getWidenedType should be used instead of FixedVectorType::get. (#109843)

reference: https://github.com/llvm/llvm-project/issues/109835

Added: 
    llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7e3dbe6260983e..b79e964cdb1b6b 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9986,8 +9986,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         Cost += ::getShuffleCost(
             TTI, TTI::SK_InsertSubvector,
-            FixedVectorType::get(ScalarTy, CommonMask.size()), {}, CostKind,
-            Idx, FixedVectorType::get(ScalarTy, E->getVectorFactor()));
+            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
+            getWidenedType(ScalarTy, E->getVectorFactor()));
         if (!CommonMask.empty()) {
           std::iota(std::next(CommonMask.begin(), Idx),
                     std::next(CommonMask.begin(), Idx + E->getVectorFactor()),

diff  --git a/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll
new file mode 100644
index 00000000000000..965bfc7074c638
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
+
+@b = external dso_local local_unnamed_addr global i64, align 8
+@d = external dso_local local_unnamed_addr global i32, align 4
+@c = external dso_local local_unnamed_addr global i32, align 4
+@a = external dso_local local_unnamed_addr global i8, align 2
+
+define void @e() {
+; CHECK-LABEL: @e(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[C_PROMOTED5:%.*]] = load i32, ptr @c, align 4
+; CHECK-NEXT:    [[A_PROMOTED7:%.*]] = load i8, ptr @a, align 2
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[C_PROMOTED5]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i8> <i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 [[A_PROMOTED7]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 -6, i32 3, i32 12, i32 21, i32 30, i32 39, i32 48, i32 57, i32 66, i32 75, i32 84, i32 93, i32 102, i32 111, i32 120, i32 129>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 -4, i32 5, i32 14, i32 23, i32 32, i32 41, i32 50, i32 59, i32 68, i32 77, i32 86, i32 95, i32 104, i32 113, i32 122, i32 131>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 -2, i32 7, i32 16, i32 25, i32 34, i32 43, i32 52, i32 61, i32 70, i32 79, i32 88, i32 97, i32 106, i32 115, i32 124, i32 133>
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 0, i32 9, i32 18, i32 27, i32 36, i32 45, i32 54, i32 63, i32 72, i32 81, i32 90, i32 99, i32 108, i32 117, i32 126, i32 135>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <16 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <16 x i32> [[TMP3]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <16 x i32> [[INDUCTION]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <16 x i32> [[DOTSPLAT]], <i32 -1, i32 -10, i32 -19, i32 -28, i32 -37, i32 -46, i32 -55, i32 -64, i32 -73, i32 -82, i32 -91, i32 -100, i32 -109, i32 -118, i32 -127, i32 -136>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i1> [[TMP12]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i8> [[TMP0]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[C_PROMOTED5]], 81
+; CHECK-NEXT:    store i64 -1, ptr @b, align 8
+; CHECK-NEXT:    store i32 9, ptr @d, align 4
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @c, align 4
+; CHECK-NEXT:    store i8 [[TMP16]], ptr @a, align 2
+; CHECK-NEXT:    ret void
+;
+vector.ph:
+  %c.promoted5 = load i32, ptr @c, align 4
+  %a.promoted7 = load i8, ptr @a, align 2
+  %.splatinsert = insertelement <16 x i32> poison, i32 %c.promoted5, i64 0
+  %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
+  %0 = insertelement <16 x i8> <i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %a.promoted7, i64 0
+  %1 = add <16 x i32> %.splat, <i32 -6, i32 3, i32 12, i32 21, i32 30, i32 39, i32 48, i32 57, i32 66, i32 75, i32 84, i32 93, i32 102, i32 111, i32 120, i32 129>
+  %2 = add <16 x i32> %.splat, <i32 -4, i32 5, i32 14, i32 23, i32 32, i32 41, i32 50, i32 59, i32 68, i32 77, i32 86, i32 95, i32 104, i32 113, i32 122, i32 131>
+  %3 = add <16 x i32> %.splat, <i32 -2, i32 7, i32 16, i32 25, i32 34, i32 43, i32 52, i32 61, i32 70, i32 79, i32 88, i32 97, i32 106, i32 115, i32 124, i32 133>
+  %induction = add <16 x i32> %.splat, <i32 0, i32 9, i32 18, i32 27, i32 36, i32 45, i32 54, i32 63, i32 72, i32 81, i32 90, i32 99, i32 108, i32 117, i32 126, i32 135>
+  %4 = icmp ult <16 x i32> %1, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %5 = icmp ult <16 x i32> %2, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %6 = icmp ult <16 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %7 = icmp ult <16 x i32> %induction, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %8 = icmp eq <16 x i32> %.splat, <i32 -1, i32 -10, i32 -19, i32 -28, i32 -37, i32 -46, i32 -55, i32 -64, i32 -73, i32 -82, i32 -91, i32 -100, i32 -109, i32 -118, i32 -127, i32 -136>
+  %9 = or <16 x i1> %4, %5
+  %10 = or <16 x i1> %9, %6
+  %11 = or <16 x i1> %10, %7
+  %12 = or <16 x i1> %11, %8
+  %13 = zext <16 x i1> %12 to <16 x i8>
+  %14 = or <16 x i8> %0, %13
+  %15 = shufflevector <16 x i8> %14, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %15)
+  %17 = add i32 %c.promoted5, 81
+  store i64 -1, ptr @b, align 8
+  store i32 9, ptr @d, align 4
+  store i32 %17, ptr @c, align 4
+  store i8 %16, ptr @a, align 2
+  ret void
+}