[llvm] [SLP] Fix cost estimation of external uses with wrong VF (PR #148185)
Gaëtan Bossu via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 04:49:07 PDT 2025
https://github.com/gbossu updated https://github.com/llvm/llvm-project/pull/148185
From 97b198ae224198f95c59a94814fafbe3ab0be201 Mon Sep 17 00:00:00 2001
From: Gaëtan Bossu <gaetan.bossu at arm.com>
Date: Fri, 11 Jul 2025 11:17:41 +0000
Subject: [PATCH 1/2] [SLP] Harmonise findLaneForValue() return type (NFC)
The lane is computed as an unsigned, so let's return it as unsigned.
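For illustration, here is a minimal standalone sketch (not LLVM code; names are
hypothetical stand-ins) of the signed/unsigned mismatch this removes: the lane
index falls out of an unsigned scan over the scalars, so returning it as int
forced conversions at the call sites.

    // Standalone sketch, assuming a simplified stand-in for Scalars.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    unsigned findLane(const std::vector<int> &Scalars, int V) {
      // The index naturally comes out of an unsigned computation...
      auto It = std::find(Scalars.begin(), Scalars.end(), V);
      return static_cast<unsigned>(It - Scalars.begin());
    }

    int main() {
      std::vector<int> Scalars{10, 20, 30, 40};
      unsigned VF = Scalars.size();
      // ...so returning unsigned lets callers compare against the vector
      // factor without int<->unsigned conversions:
      assert(findLane(Scalars, 30) < VF);
    }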
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7b77954e3a4ff..1b8a80d2b3c94 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3898,7 +3898,7 @@ class BoUpSLP {
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
- int findLaneForValue(Value *V) const {
+ unsigned findLaneForValue(Value *V) const {
unsigned FoundLane = getVectorFactor();
for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
std::advance(It, 1)) {
@@ -4344,7 +4344,7 @@ class BoUpSLP {
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
- ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
+ ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
: Scalar(S), User(U), E(E), Lane(L) {}
/// Which scalar in our function.
@@ -4357,7 +4357,7 @@ class BoUpSLP {
const TreeEntry &E;
/// Which lane does the scalar belong to.
- int Lane;
+ unsigned Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
@@ -7901,7 +7901,7 @@ void BoUpSLP::buildExternalUses(
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
@@ -7949,7 +7949,7 @@ void BoUpSLP::buildExternalUses(
if (U && Scalar->hasNUsesOrMore(UsesLimit))
U = nullptr;
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
<< " from lane " << FoundLane << " from " << *Scalar
<< ".\n");
From c7082aafebd0218aa0e1072d4729f94c0b3a17ec Mon Sep 17 00:00:00 2001
From: Gaëtan Bossu <gaetan.bossu at arm.com>
Date: Tue, 24 Jun 2025 12:21:48 +0000
Subject: [PATCH 2/2] [SLP] Fix cost estimation of external uses with wrong VF
The cost estimation of external uses assumed that the VF remains
constant throughout the tree. That's not always true, which meant we
could query the extraction cost for a lane that is out of bounds.
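To illustrate, here is a minimal standalone sketch (simplified stand-ins, not
the actual LLVM data structures): when per-entry vector factors differ, the
root entry's width is the wrong bound for a lane of a deeper entry.

    // Standalone sketch, assuming a simplified stand-in for TreeEntry.
    #include <cassert>
    #include <vector>

    struct TreeEntry {
      std::vector<int> Scalars;
      unsigned getVectorFactor() const { return Scalars.size(); }
    };

    int main() {
      TreeEntry Root{{0, 1, 2, 3}};             // root bundle: VF = 4
      TreeEntry Deep{{0, 1, 2, 3, 4, 5, 6, 7}}; // deeper entry: VF = 8
      unsigned Lane = 7; // external use of Deep's lane 7

      // Buggy: one width taken from the root and reused for every entry.
      unsigned RootWidth = Root.getVectorFactor();
      assert(Lane >= RootWidth); // lane 7 is out of bounds for a VF-4 type

      // Fixed: query the VF of the entry the scalar actually belongs to.
      unsigned EntryWidth = Deep.getVectorFactor();
      assert(Lane < EntryWidth); // in bounds
    }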
While experimenting with re-vectorisation for AArch64, we ran into
this issue. We cannot add a proper AArch64 test yet, as that would
require bringing in more changes.
This commit only fixes the computation of the VF and adds an assert.
The following tests were failing after the assert was added:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 13 ++++++--
.../SLPVectorizer/RISCV/segmented-loads.ll | 31 +++++++++++++++++++
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b8a80d2b3c94..88da7015fc770 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
for (ExternalUser &EU : ExternalUses) {
+ LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
+ << EU.E.Idx << " in lane " << EU.Lane << "\n");
+ LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
+ LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
+
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// for the extract and the added cost of the sign extend if needed.
InstructionCost ExtraCost = TTI::TCC_Free;
auto *ScalarTy = EU.Scalar->getType();
+ const unsigned BundleWidth = EU.E.getVectorFactor();
+ assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
const TreeEntry *Entry = &EU.E;
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
VecTy = getWidenedType(MinTy, BundleWidth);
ExtraCost =
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
+ LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
+ << ExtraCost << "\n");
} else {
ExtraCost =
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
+ LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
+ << *VecTy << ": " << ExtraCost << "\n");
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
index ce26bd3b89392..e800b5e016b74 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
@@ -31,3 +31,34 @@ define void @test() {
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
ret void
}
+
+; Same as above, but %a7 is also used as a scalar and must be extracted from
+; the wide load. (Or in this case, kept as a scalar load).
+define double @test_with_extract() {
+; CHECK-LABEL: @test_with_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
+; CHECK-NEXT: ret double [[A7]]
+;
+ %a0 = load double, ptr @src, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+ %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+ %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+ %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+ %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+ %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+ %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+ %res1 = fsub fast double %a0, %a1
+ %res2 = fsub fast double %a2, %a3
+ %res3 = fsub fast double %a4, %a5
+ %res4 = fsub fast double %a6, %a7
+ store double %res1, ptr @dst, align 8
+ store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
+ store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
+ store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
+ ret double %a7
+}