[llvm] [SLP][TTI]Improve detection of the insert-subvector pattern for SLP. (PR #74749)

Thu Dec 7 11:10:56 PST 2023

https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/74749

SLP vectorizer passes the type of the subvector and the mask, which size
determines the size of the resulting vector. TTI should support this
pattern to improve cost estimation of the insert_subvector shuffle
pattern.


>From 334c9cfb9b6c3e6d79248ee44f0dd9faa27fea68 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 7 Dec 2023 10:17:14 -0800
Subject: [PATCH] [SLP][TTI]Improve detection of the insert-subvector pattern
 for SLP.

SLP vectorizer passes the type of the subvector and the mask, which size
determines the size of the resulting vector. TTI should support this
pattern to improve cost estimation of the insert_subvector shuffle
pattern.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h        | 11 ++++++++---
 .../RISCV/remarks-insert-into-small-vector.ll   | 17 ++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index e05ce2890a08c8..94e35e15293970 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -942,7 +942,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                               ArrayRef<int> Mask,
-                                              VectorType *Ty, int &Index,
+                                              VectorType *&Ty, int &Index,
                                               VectorType *&SubTy) const {
     if (Mask.empty())
       return Kind;
@@ -963,8 +963,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       int NumSubElts;
       if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                  Mask, NumSrcElts, NumSubElts, Index)) {
-        if (Index + NumSubElts > NumSrcElts)
-          return Kind;
+        if (Index + NumSubElts > NumSrcElts) {
+          if (Index + NumSrcElts > static_cast<int>(Mask.size()))
+            return Kind;
+          SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
+          std::swap(Ty, SubTy);
+          return TTI::SK_InsertSubvector;
+        }
         SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
         return TTI::SK_InsertSubvector;
       }
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index 8e0f382222241f..de1eecd98eeb3f 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT:  Function:        test
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:  - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:  - Cost:            '9'
+; YAML-NEXT:  - Cost:            '3'
 ; YAML-NEXT:  - String:          ' and with tree size '
 ; YAML-NEXT:  - TreeSize:        '7'
 
@@ -19,20 +19,15 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[V9IDX:%.*]] = getelementptr i8, ptr null, i32 4
-; CHECK-NEXT:    [[V14IDX:%.*]] = getelementptr i8, ptr null, i32 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    [[V0_0:%.*]] = select i1 [[TMP7]], float [[TMP0]], float 0.000000e+00
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP3]], <2 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    [[V9_0:%.*]] = select i1 [[TMP9]], float [[TMP2]], float 0.000000e+00
-; CHECK-NEXT:    store float [[V0_0]], ptr null, align 4
-; CHECK-NEXT:    store float [[V9_0]], ptr [[V9IDX]], align 4
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[V14IDX]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: