[llvm] [SLP][REVEC] getScalarizationOverhead should not be used when ScalarTy is FixedVectorType. (PR #117536)
Han-Kuan Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 02:29:23 PST 2024
https://github.com/HanKuanChen created https://github.com/llvm/llvm-project/pull/117536
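Reference issue: https://github.com/llvm/llvm-project/issues/117393 (regression test added in this patch: llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll)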
From 74c6638fbae52a76bc183e1adf24bd7710e0782a Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 25 Nov 2024 02:22:41 -0800
Subject: [PATCH 1/2] [SLP][REVEC] Pre-commit test.
---
.../SLPVectorizer/SystemZ/revec-fix-117393.ll | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
new file mode 100644
index 00000000000000..8ad86c8601db37
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
+
+define void @h() {
+entry:
+ %0 = shl <4 x i32> zeroinitializer, zeroinitializer
+ %1 = or <4 x i32> %0, zeroinitializer
+ %2 = or <4 x i32> splat (i32 1), zeroinitializer
+ %3 = or <4 x i32> zeroinitializer, zeroinitializer
+ %4 = shl <4 x i32> zeroinitializer, zeroinitializer
+ %5 = or <4 x i32> %4, zeroinitializer
+ %6 = and <4 x i32> %2, %1
+ %7 = and <4 x i32> %3, %6
+ %8 = and <4 x i32> %5, %7
+ %9 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %8)
+ ret void
+}
From 05beaf2f0d2798d7fcb0b1ba650f15a18d366b6a Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 25 Nov 2024 02:22:52 -0800
Subject: [PATCH 2/2] [SLP][REVEC] getScalarizationOverhead should not be used when ScalarTy is FixedVectorType.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++++++++--
.../SLPVectorizer/SystemZ/revec-fix-117393.ll | 13 +++++++++++++
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d033b7c2ef4a92..f208ad9c9e1c38 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9616,8 +9616,20 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
Idx, getWidenedType(ScalarTy, Sz));
}
- Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
- /*Extract=*/false, CostKind);
+ if (isa<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "Only supported by REVEC.");
+ // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+ // of CreateInsertElement.
+ unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+ for (unsigned I : seq<unsigned>(TE.Scalars.size()))
+ if (DemandedElts[I])
+ Cost += TTI->getShuffleCost(
+ TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
+ I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
+ } else {
+ Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ }
int Sz = TE.Scalars.size();
SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
index 8ad86c8601db37..c40e32baad7b31 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
@@ -2,6 +2,19 @@
; RUN: opt -mtriple=systemz-unknown -mcpu=z15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
define void @h() {
+; CHECK-LABEL: @h(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> splat (i32 1), zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: ret void
+;
entry:
%0 = shl <4 x i32> zeroinitializer, zeroinitializer
%1 = or <4 x i32> %0, zeroinitializer
More information about the llvm-commits mailing list