[llvm] b65b2b4 - [SLP]Expand vector to the whole register size in extracts adjustment

Wed Oct 23 12:08:47 PDT 2024

Author: Alexey Bataev
Date: 2024-10-23T12:04:40-07:00
New Revision: b65b2b4ab60763515694c740935989f908a03312

URL: https://github.com/llvm/llvm-project/commit/b65b2b4ab60763515694c740935989f908a03312
DIFF: https://github.com/llvm/llvm-project/commit/b65b2b4ab60763515694c740935989f908a03312.diff

LOG: [SLP]Expand vector to the whole register size in extracts adjustment

The number of elements must be expanded to the full register size so that
the cost of subvector extracts is estimated correctly and the compiler
does not crash.

Fixes #113462

Added: 
    llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d824c40c401df7..889c4d94ba5e55 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9840,13 +9840,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             ::getShuffleCost(TTI, *RegShuffleKind,
                              getWidenedType(ScalarTy, EltsPerVector), SubMask);
       }
+      const unsigned BaseVF = getFullVectorNumberOfElements(
+          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
       for (unsigned Idx : Indices) {
-        assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
+        assert((Idx + EltsPerVector) <= BaseVF &&
                "SK_ExtractSubvector index out of range");
-        Cost += ::getShuffleCost(
-            TTI, TTI::SK_ExtractSubvector,
-            getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)), {},
-            CostKind, Idx, getWidenedType(ScalarTy, EltsPerVector));
+        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+                                 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
+                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
       }
       // Second attempt to check, if just a permute is better estimated than
       // subvector extract.

diff  --git a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll
new file mode 100644
index 00000000000000..43ce36337f4df7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define i32 @test(i32 %v, ptr %p) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[V:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    br i1 false, label %[[INC:.*]], label %[[PH:.*]]
+; CHECK:       [[PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 [[V]], i32 13
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP4]], <4 x i32> <i32 0, i32 31, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> <i1 poison, i1 poison, i1 false, i1 false>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[I8_I_I:%.*]] = select i1 false, i64 0, i64 0
+; CHECK-NEXT:    [[I9_I_I:%.*]] = select i1 false, i64 0, i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[I8_I_I]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[I9_I_I]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze <16 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP10]])
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP13]]
+; CHECK-NEXT:    [[AND252_US_I_24_I_I:%.*]] = select i1 [[OP_RDX]], i32 0, i32 0
+; CHECK-NEXT:    br label %[[INC]]
+; CHECK:       [[INC]]:
+; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[AND252_US_I_24_I_I]], %[[PH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[P2:%.*]] = phi i64 [ [[OP_RDX2]], %[[PH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %ld = load i32, ptr %p, align 4
+  br i1 false, label %inc, label %ph
+
+ph:
+  %bi.i.not = icmp eq i32 %ld, 0
+  %b1.i.i = icmp eq i32 %ld, 0
+  %b3.i.i = icmp eq i32 %ld, 0
+  %0 = or i1 %b3.i.i, %b1.i.i
+  %b4.i.i = icmp eq i32 %ld, 0
+  %i4.i.i = select i1 %b4.i.i, i64 0, i64 0
+  %b5.i.i = icmp eq i32 0, 0
+  %i5.i.i = select i1 %b5.i.i, i64 0, i64 0
+  %inc34.5.i.i = or i64 %i4.i.i, %i5.i.i
+  %1 = or i1 %b5.i.i, %b4.i.i
+  %i6.i.i = select i1 false, i64 0, i64 0
+  %inc34.6.i.i = or i64 %inc34.5.i.i, %i6.i.i
+  %b7.i.i = icmp eq i32 0, 0
+  %i7.i.i = select i1 false, i64 0, i64 0
+  %inc34.7.i.i = or i64 %inc34.6.i.i, %i7.i.i
+  %i8.i.i = select i1 false, i64 0, i64 0
+  %inc34.8.i.i = or i64 %inc34.7.i.i, %i8.i.i
+  %i9.i.i = select i1 false, i64 0, i64 0
+  %inc34.9.i.i = or i64 %inc34.8.i.i, %i9.i.i
+  %b10.i.i = icmp eq i32 0, 0
+  %b11.i.i = icmp eq i32 0, 0
+  %2 = or i1 %b11.i.i, %b10.i.i
+  %b12.i.i = icmp eq i32 %v, 0
+  %3 = or i1 %b12.i.i, %2
+  %b13.i.i = icmp eq i32 0, 0
+  %b14.i.i = icmp eq i32 0, 0
+  %4 = or i1 %b14.i.i, %b13.i.i
+  %b16.i.i = icmp eq i32 0, 0
+  %b17.i.i = icmp eq i32 0, 0
+  %5 = or i1 %b17.i.i, %b16.i.i
+  %b18.i.i = icmp eq i32 0, 0
+  %6 = or i1 %b18.i.i, %5
+  %b19.i.i = icmp eq i32 0, 0
+  %b20.i.i = icmp eq i32 0, 0
+  %7 = or i1 %b20.i.i, %b19.i.i
+  %b21.i.i = icmp eq i32 0, 0
+  %8 = or i1 %b21.i.i, %7
+  %b22.i.i = icmp eq i32 0, 0
+  %b23.i.i = icmp eq i32 0, 0
+  %9 = or i1 %b23.i.i, %b22.i.i
+  %b24.i.i = icmp eq i32 0, 0
+  %10 = or i1 %b24.i.i, %9
+  %11 = select i1 %10, i1 true, i1 %8
+  %12 = select i1 %11, i1 true, i1 %6
+  %13 = select i1 %12, i1 true, i1 %4
+  %14 = select i1 %13, i1 true, i1 %3
+  %15 = select i1 %14, i1 true, i1 %b7.i.i
+  %16 = select i1 %15, i1 true, i1 %1
+  %17 = or i1 %0, %bi.i.not
+  %18 = select i1 %16, i1 true, i1 %17
+  %and252.us.i.24.i.i = select i1 %18, i32 0, i32 0
+  br label %inc
+
+inc:
+  %p1 = phi i32 [ %and252.us.i.24.i.i, %ph ], [ 0, %entry ]
+  %p2 = phi i64 [ %inc34.9.i.i, %ph ], [ 0, %entry ]
+  ret i32 0
+}