[llvm] [SLP] Fix cost estimation of external uses with wrong VF (PR #148185)
Gaƫtan Bossu via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 06:19:35 PDT 2025
https://github.com/gbossu updated https://github.com/llvm/llvm-project/pull/148185
>From a4b41eb9b8f9d559f634b18d460748ef85c21e7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at arm.com>
Date: Tue, 24 Jun 2025 12:21:48 +0000
Subject: [PATCH] [SLP] Fix cost estimation of external uses with wrong VF
It assumed that the VF remains constant throughout the tree. That's not
always true. This meant that we could query the extraction cost for a
lane that is out of bounds.
While experimenting with re-vectorisation for AArch64, we ran into
this issue. We cannot add a proper AArch64 test as more changes would
need to be brought in.
This commit is only fixing the computation of VF and adding an assert.
Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 13 ++++++--
.../SLPVectorizer/RISCV/segmented-loads.ll | 31 +++++++++++++++++++
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b8a80d2b3c94..88da7015fc770 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
for (ExternalUser &EU : ExternalUses) {
+ LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
+ << EU.E.Idx << " in lane " << EU.Lane << "\n");
+ LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
+ LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
+
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// for the extract and the added cost of the sign extend if needed.
InstructionCost ExtraCost = TTI::TCC_Free;
auto *ScalarTy = EU.Scalar->getType();
+ const unsigned BundleWidth = EU.E.getVectorFactor();
+ assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
const TreeEntry *Entry = &EU.E;
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
VecTy = getWidenedType(MinTy, BundleWidth);
ExtraCost =
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
+ LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
+ << ExtraCost << "\n");
} else {
ExtraCost =
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
+ LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
+ << *VecTy << ": " << ExtraCost << "\n");
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
index ce26bd3b89392..e800b5e016b74 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
@@ -31,3 +31,34 @@ define void @test() {
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
ret void
}
+
+; Same as above, but %a7 is also used as a scalar and must be extracted from
+; the wide load. (Or in this case, kept as a scalar load).
+define double @test_with_extract() {
+; CHECK-LABEL: @test_with_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
+; CHECK-NEXT: ret double [[A7]]
+;
+ %a0 = load double, ptr @src, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+ %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+ %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+ %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+ %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+ %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+ %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+ %res1 = fsub fast double %a0, %a1
+ %res2 = fsub fast double %a2, %a3
+ %res3 = fsub fast double %a4, %a5
+ %res4 = fsub fast double %a6, %a7
+ store double %res1, ptr @dst, align 8
+ store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
+ store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
+ store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
+ ret double %a7
+}
More information about the llvm-commits
mailing list