[llvm] a3fd82c - [SLP]Fix the crash on cost calculation if non-compatible vectors shuffled.

Fri Apr 30 09:52:15 PDT 2021

Author: Alexey Bataev
Date: 2021-04-30T09:34:20-07:00
New Revision: a3fd82c289878e1a8fa5833d87b688cd50624247

URL: https://github.com/llvm/llvm-project/commit/a3fd82c289878e1a8fa5833d87b688cd50624247
DIFF: https://github.com/llvm/llvm-project/commit/a3fd82c289878e1a8fa5833d87b688cd50624247.diff

LOG: [SLP]Fix the crash on cost calculation if non-compatible vectors shuffled.

If the extracts from non-power-of-2 vectors are recognized as shuffles,
we need some extra checks to avoid crashing the cost calculation when trying
to get the cost for subvector extracts. In this case we need to check carefully
that we do not go out of bounds of the original vector; otherwise the
TTI cost model will crash on an assert.

Differential Revision: https://reviews.llvm.org/D101477

Added: 
    llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cd3d4b8a6c07b..5e2a8a10207ab 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3589,13 +3589,27 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
     for (const auto &Data : ExtractVectorsTys) {
       auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
       unsigned NumElts = VecTy->getNumElements();
-      if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy))
-        Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                      EEVTy, None,
-                                      (Data.second / NumElts) * NumElts, VecTy);
-      else
+      if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
+        unsigned Idx = (Data.second / NumElts) * NumElts;
+        unsigned EENumElts = EEVTy->getNumElements();
+        if (Idx + NumElts <= EENumElts) {
+          Cost +=
+              TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                    EEVTy, None, Idx, VecTy);
+        } else {
+          // Need to round up the subvector type vectorization factor to avoid a
+          // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
+          // <= EENumElts.
+          auto *SubVT =
+              FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
+          Cost +=
+              TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                    EEVTy, None, Idx, SubVT);
+        }
+      } else {
         Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
                                       VecTy, None, 0, EEVTy);
+      }
     }
   };
   if (E->State == TreeEntry::NeedToGather) {

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
new file mode 100644
index 0000000000000..3a2553066f9bc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck %s
+
+define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
+; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
+; CHECK-NEXT:    [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
+; CHECK-NEXT:    [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
+; CHECK-NEXT:    [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[ARG0_1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 [[ARG0_2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[ARG1_1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[ARG1_2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[TMP6]], i64 1
+; CHECK-NEXT:    ret <2 x i16> [[INS_2]]
+;
+bb:
+  %arg0.1 = extractelement <9 x i16> undef, i64 7
+  %arg0.2 = extractelement <9 x i16> %arg0, i64 8
+  %arg1.1 = extractelement <9 x i16> %arg1, i64 7
+  %arg1.2 = extractelement <9 x i16> %arg1, i64 8
+  %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.uadd.sat.i16(i16 %arg0.2, i16 %arg1.2)
+  %ins.1 = insertelement <2 x i16> undef, i16 %add.1, i64 0
+  %ins.2 = insertelement <2 x i16> %ins.1, i16 %add.2, i64 1
+  ret <2 x i16> %ins.2
+}
+
+declare i16 @llvm.uadd.sat.i16(i16, i16) #0
+attributes #0 = { nounwind readnone speculatable willreturn }