[llvm] [SLP][REVEC] Fix cost model for getGatherCost with FixedVectorType ScalarTy. (PR #109369)
Han-Kuan Chen via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 22 00:35:10 PDT 2024
https://github.com/HanKuanChen updated https://github.com/llvm/llvm-project/pull/109369
>From 95191e0a817864780094ef4360cec4ea4a98fd46 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Thu, 19 Sep 2024 20:50:39 -0700
Subject: [PATCH 1/4] [SLP][REVEC] Pre-commit test.
---
.../RISCV/revec-getGatherCost.ll | 27 +++++++++++++++++++
1 file changed, 27 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
new file mode 100644
index 00000000000000..29dca7f3c60aca
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 %s | FileCheck %s
+
+define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[FMULADD8:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT165_I]], <4 x float> [[LOAD17:%.*]], <4 x float> [[FMULADD7:%.*]])
+; CHECK-NEXT: [[FMULADD17:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT309_I]], <4 x float> [[LOAD17]], <4 x float> [[FMULADD16:%.*]])
+; CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR:%.*]], i64 16
+; CHECK-NEXT: store <4 x float> [[FMULADD8]], ptr [[OUT_PTR]], align 4
+; CHECK-NEXT: store <4 x float> [[FMULADD17]], ptr [[ADD_PTR_I_I]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %vext165.i = shufflevector <4 x float> %load6, <4 x float> %load7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %vext309.i = shufflevector <4 x float> %load7, <4 x float> %load8, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %fmuladd8 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext165.i, <4 x float> %load17, <4 x float> %fmuladd7)
+ %fmuladd17 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext309.i, <4 x float> %load17, <4 x float> %fmuladd16)
+ %add.ptr.i.i = getelementptr inbounds i8, ptr %out_ptr, i64 16
+ store <4 x float> %fmuladd8, ptr %out_ptr, align 4
+ store <4 x float> %fmuladd17, ptr %add.ptr.i.i, align 4
+ ret void
+}
+
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
>From 5915170c7bef7129cd646b05129ef9ed2587149d Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Thu, 19 Sep 2024 20:57:26 -0700
Subject: [PATCH 2/4] [SLP][REVEC] Fix cost model for getGatherCost with FixedVectorType ScalarTy.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 31 +++++++++++++------
.../RISCV/revec-getGatherCost.ll | 14 ++++++---
2 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ddeb97463281c9..14450ff27a6844 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11732,8 +11732,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- APInt ShuffledElements = APInt::getZero(VecTy->getNumElements());
+ APInt ShuffledElements = APInt::getZero(VL.size());
DenseMap<Value *, unsigned> UniqueElements;
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost;
@@ -11753,8 +11752,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Value *V = VL[I];
// No need to shuffle duplicates for constants.
if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
- ShuffledElements.setBits(I * ScalarTyNumElements,
- I * ScalarTyNumElements + ScalarTyNumElements);
+ ShuffledElements.setBit(I);
ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
continue;
}
@@ -11767,14 +11765,27 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
}
DuplicateNonConst = true;
- ShuffledElements.setBits(I * ScalarTyNumElements,
- I * ScalarTyNumElements + ScalarTyNumElements);
+ ShuffledElements.setBit(I);
ShuffleMask[I] = Res.first->second;
}
- if (ForPoisonSrc)
- Cost =
- TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ if (ForPoisonSrc) {
+ if (isa<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "Only supported by REVEC.");
+ // We don't need to insert elements one by one. Instead, we can insert the
+ // entire vector into the destination.
+ Cost = 0;
+ unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+ for (unsigned I = 0, E = VL.size(); I != E; ++I)
+ if (!ShuffledElements[I])
+ Cost += TTI->getShuffleCost(
+ TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
+ I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
+ } else {
+ Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ }
+ }
if (DuplicateNonConst)
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
VecTy, ShuffleMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
index 29dca7f3c60aca..648eb5bc5119dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
@@ -6,11 +6,15 @@ define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[FMULADD8:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT165_I]], <4 x float> [[LOAD17:%.*]], <4 x float> [[FMULADD7:%.*]])
-; CHECK-NEXT: [[FMULADD17:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT309_I]], <4 x float> [[LOAD17]], <4 x float> [[FMULADD16:%.*]])
-; CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR:%.*]], i64 16
-; CHECK-NEXT: store <4 x float> [[FMULADD8]], ptr [[OUT_PTR]], align 4
-; CHECK-NEXT: store <4 x float> [[FMULADD17]], ptr [[ADD_PTR_I_I]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[VEXT165_I]], i64 0)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP0]], <4 x float> [[VEXT309_I]], i64 4)
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> poison, i64 4)
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[LOAD17:%.*]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[FMULADD7:%.*]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP5]], <4 x float> [[FMULADD16:%.*]], i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
>From eee7a6221c06273db1b3db750baa84d2bf6a0172 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Sun, 22 Sep 2024 00:06:23 -0700
Subject: [PATCH 3/4] [SLP][REVEC] Apply comment.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 14450ff27a6844..c000f7648e647b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11775,7 +11775,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// entire vector into the destination.
Cost = 0;
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- for (unsigned I = 0, E = VL.size(); I != E; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (!ShuffledElements[I])
Cost += TTI->getShuffleCost(
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
>From eda7eff2a1081f33a5617ca1716c0561f6337ce1 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Sun, 22 Sep 2024 00:33:25 -0700
Subject: [PATCH 4/4] [SLP][REVEC] Apply comment.
---
.../SLPVectorizer/RISCV/revec-getGatherCost.ll | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
index 648eb5bc5119dc..887f59bbda94d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
@@ -1,5 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 -pass-remarks-output=%t %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+; YAML: --- !Passed
+; YAML: Pass: slp-vectorizer
+; YAML: Name: StoresVectorized
+; YAML: Function: test
+; YAML: Args:
+; YAML: - String: 'Stores SLP vectorized with cost '
+; YAML: - Cost: '6'
+; YAML: - String: ' and with tree size '
+; YAML: - TreeSize: '5'
define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) {
; CHECK-LABEL: @test(
More information about the llvm-commits mailing list