[llvm] [RISCV][TTI] Reduce cost of a <N x i1> build_vector pattern (PR #109449)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 20 11:02:07 PDT 2024
https://github.com/preames updated https://github.com/llvm/llvm-project/pull/109449
>From bdd4441e59ad4416e55eefafd48ed20ace64d9ed Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 20 Sep 2024 10:16:07 -0700
Subject: [PATCH 1/2] [RISCV][TTI] Reduce cost of a <N x i1> build_vector
pattern
This is a follow-up to 7f6bbb3. When lowering a <N x i1>
build_vector, we currently choose to extend to i8, perform the
build_vector there, and then truncate back to i1 in vector form. Our
costing on the other hand accounts for it as if we performed
a vector extend, an insert, and a vector extract for every
element. This significantly overestimates the cost.
Note that we can likely do better in our build_vector lowering
here by packing the bits in scalar, and doing a build_vector
of the packed bits. Regardless, our costing should match our
lowering.
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 15 +++++++++++++--
.../Analysis/CostModel/RISCV/rvv-intrinsics.ll | 16 ++++++++--------
.../VectorCombine/RISCV/shuffle-of-intrinsics.ll | 7 +++----
3 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 595475f37a7a69..e6a27e061326f2 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -635,8 +635,19 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead(
InstructionCost Cost = BaseT::getScalarizationOverhead(
Ty, DemandedElts, Insert, Extract, CostKind);
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
- if (Insert && !Extract && LT.first.isValid() && LT.second.isVector() &&
- Ty->getScalarSizeInBits() != 1) {
+ if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
+ if (Ty->getScalarSizeInBits() == 1) {
+ auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
+ // Note: Implicit scalar anyextend is assumed to be free since the i1
+ // must be stored in a GPR.
+ InstructionCost BVCost =
+ getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract, CostKind);
+ InstructionCost TruncCost =
+ getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, TTI::CastContextHint::None,
+ CostKind, nullptr);
+ return BVCost + TruncCost;
+ }
+
assert(LT.second.isFixedLengthVector());
MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
if (isM1OrSmaller(ContainerVT)) {
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 807657797288da..bb98508f239c1b 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -1029,10 +1029,10 @@ define void @store() {
define void @strided_load() {
; CHECK-LABEL: 'strided_load'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
@@ -1056,10 +1056,10 @@ define void @strided_load() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPEBASED-LABEL: 'strided_load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
index 7ccc14cc0b125e..f3e5d273e88cca 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
@@ -48,10 +48,9 @@ entry:
define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
+; CHECK-NEXT: ret <8 x i1> [[TMP3]]
;
entry:
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
>From b75bde18dcbff29f882dda5a6d8c0bb8f444332a Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 20 Sep 2024 10:57:35 -0700
Subject: [PATCH 2/2] clang-format and style fixup afterwards
---
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e6a27e061326f2..b683bd5ee50ceb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -640,12 +640,10 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead(
auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
// Note: Implicit scalar anyextend is assumed to be free since the i1
// must be stored in a GPR.
- InstructionCost BVCost =
- getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract, CostKind);
- InstructionCost TruncCost =
- getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, TTI::CastContextHint::None,
- CostKind, nullptr);
- return BVCost + TruncCost;
+ return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
+ CostKind) +
+ getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
+ TTI::CastContextHint::None, CostKind, nullptr);
}
assert(LT.second.isFixedLengthVector());
More information about the llvm-commits
mailing list