[llvm] [SLP] Fix cost estimation of external uses with wrong VF (PR #148185)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 03:19:58 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Gaƫtan Bossu (gbossu)
<details>
<summary>Changes</summary>
It assumed that the VF remains constant throughout the tree. That's not always true. This meant that we could query the extraction cost for a lane that is out of bounds.
While experimenting with re-vectorisation for AArch64, and we ran into this issue. We cannot add a proper AArch64 test as more changes would need to be brought in.
This commit is only fixing the computation of VF and adding an assert. Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
---
Full diff: https://github.com/llvm/llvm-project/pull/148185.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+16-7)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll (+31)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7b77954e3a4ff..88da7015fc770 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3898,7 +3898,7 @@ class BoUpSLP {
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
- int findLaneForValue(Value *V) const {
+ unsigned findLaneForValue(Value *V) const {
unsigned FoundLane = getVectorFactor();
for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
std::advance(It, 1)) {
@@ -4344,7 +4344,7 @@ class BoUpSLP {
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
- ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
+ ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
: Scalar(S), User(U), E(E), Lane(L) {}
/// Which scalar in our function.
@@ -4357,7 +4357,7 @@ class BoUpSLP {
const TreeEntry &E;
/// Which lane does the scalar belong to.
- int Lane;
+ unsigned Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
@@ -7901,7 +7901,7 @@ void BoUpSLP::buildExternalUses(
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
@@ -7949,7 +7949,7 @@ void BoUpSLP::buildExternalUses(
if (U && Scalar->hasNUsesOrMore(UsesLimit))
U = nullptr;
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
<< " from lane " << FoundLane << " from " << *Scalar
<< ".\n");
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
for (ExternalUser &EU : ExternalUses) {
+ LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
+ << EU.E.Idx << " in lane " << EU.Lane << "\n");
+ LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
+ LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
+
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// for the extract and the added cost of the sign extend if needed.
InstructionCost ExtraCost = TTI::TCC_Free;
auto *ScalarTy = EU.Scalar->getType();
+ const unsigned BundleWidth = EU.E.getVectorFactor();
+ assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
const TreeEntry *Entry = &EU.E;
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
VecTy = getWidenedType(MinTy, BundleWidth);
ExtraCost =
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
+ LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
+ << ExtraCost << "\n");
} else {
ExtraCost =
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
+ LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
+ << *VecTy << ": " << ExtraCost << "\n");
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
index ce26bd3b89392..e800b5e016b74 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
@@ -31,3 +31,34 @@ define void @test() {
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
ret void
}
+
+; Same as above, but %a7 is also used as a scalar and must be extracted from
+; the wide load. (Or in this case, kept as a scalar load).
+define double @test_with_extract() {
+; CHECK-LABEL: @test_with_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
+; CHECK-NEXT: ret double [[A7]]
+;
+ %a0 = load double, ptr @src, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+ %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+ %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+ %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+ %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+ %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+ %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+ %res1 = fsub fast double %a0, %a1
+ %res2 = fsub fast double %a2, %a3
+ %res3 = fsub fast double %a4, %a5
+ %res4 = fsub fast double %a6, %a7
+ store double %res1, ptr @dst, align 8
+ store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
+ store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
+ store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
+ ret double %a7
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/148185
More information about the llvm-commits
mailing list