[llvm] 076318b - [SLP]Use proper order when calculating costs for geps/extracts to correctly identify profitability
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 07:49:43 PDT 2025
Author: Alexey Bataev
Date: 2025-04-09T07:43:23-07:00
New Revision: 076318bd78f5ed338350841075316a75d89ecd9b
URL: https://github.com/llvm/llvm-project/commit/076318bd78f5ed338350841075316a75d89ecd9b
DIFF: https://github.com/llvm/llvm-project/commit/076318bd78f5ed338350841075316a75d89ecd9b.diff
LOG: [SLP]Use proper order when calculating costs for geps/extracts to correctly identify profitability
The scalars need to be reordered properly when evaluating the costs of the
external uses/GEPs, to avoid discrepancies in the profitability costs used
to choose between gathered and compressed loads.
Fixes https://github.com/llvm/llvm-project/pull/132099#issuecomment-2789627454
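
For context, a minimal standalone sketch (plain C++, not the actual LLVM
helpers) of the inverse-permutation step the fix relies on. The names
invertOrder/reorderByMask and the lane convention (vector lane I reads scalar
Order[I]) are assumptions made for illustration; they are only meant to mirror
what inversePermutation/reorderScalars do in SLPVectorizer.cpp.

// Sketch only: mimics inversePermutation/reorderScalars from SLPVectorizer.cpp
// under the assumed convention that vector lane I reads scalar Order[I].
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

// Build Mask such that Mask[Order[I]] = I, i.e. the lane each scalar lands in.
static std::vector<int> invertOrder(const std::vector<int> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (int I = 0, E = (int)Order.size(); I < E; ++I)
    Mask[Order[I]] = I;
  return Mask;
}

// Place each scalar into its vector lane: Res[Mask[I]] = Scalars[I].
template <typename T>
static std::vector<T> reorderByMask(const std::vector<T> &Scalars,
                                    const std::vector<int> &Mask) {
  std::vector<T> Res(Scalars.size());
  for (std::size_t I = 0; I < Mask.size(); ++I)
    Res[Mask[I]] = Scalars[I];
  return Res;
}

int main() {
  // Hypothetical example: lane 0 reads scalar 2, lane 1 reads scalar 0,
  // lane 2 reads scalar 1.
  std::vector<int> Order = {2, 0, 1};
  std::vector<int> Mask = invertOrder(Order); // {1, 2, 0}
  std::vector<const char *> Scalars = {"p+0", "p+4", "p+8"};
  std::vector<const char *> Lanes = reorderByMask(Scalars, Mask);
  assert(Lanes[0] == Scalars[2] && Lanes[1] == Scalars[0] &&
         Lanes[2] == Scalars[1]);
  std::printf("lane0=%s lane1=%s lane2=%s\n", Lanes[0], Lanes[1], Lanes[2]);
  return 0;
}

Computing the per-lane extract and GEP costs against this reordered layout
keeps both sides of the gather-vs-compressed-load comparison consistent, which
is what the patch below does with the inverse Mask and OrderedPointerOps.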
Added:
llvm/test/Transforms/SLPVectorizer/AArch64/reordered-loads.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..87fc617eddedc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5491,12 +5491,16 @@ static bool isMaskedLoadCompress(
const unsigned Sz = VL.size();
auto *VecTy = getWidenedType(ScalarTy, Sz);
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ SmallVector<int> Mask;
+ if (!Order.empty())
+ inversePermutation(Order, Mask);
// Check external uses.
for (const auto [I, V] : enumerate(VL)) {
if (AreAllUsersVectorized(V))
continue;
InstructionCost ExtractCost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
+ Mask.empty() ? I : Mask[I]);
InstructionCost ScalarCost =
TTI.getInstructionCost(cast<Instruction>(V), CostKind);
if (ExtractCost <= ScalarCost)
@@ -5536,8 +5540,11 @@ static bool isMaskedLoadCompress(
bool IsStrided =
buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
assert(CompressMask.size() >= 2 && "At least two elements are required");
+ SmallVector<Value *> OrderedPointerOps(PointerOps);
+ if (!Order.empty())
+ reorderScalars(OrderedPointerOps, Mask);
auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
// The cost of scalar loads.
InstructionCost ScalarLoadsCost =
@@ -5564,17 +5571,16 @@ static bool isMaskedLoadCompress(
TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
LI->getPointerAddressSpace(), CostKind);
}
- SmallVector<int> Mask;
- if (!Order.empty())
- inversePermutation(Order, Mask);
if (IsStrided) {
// Check for potential segmented(interleaved) loads.
if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
CommonAlignment,
LI->getPointerAddressSpace())) {
- InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
- Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
- CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+ InstructionCost InterleavedCost =
+ VectorGEPCost + TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, CompressMask[1],
+ std::nullopt, CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind, IsMasked);
if (!Mask.empty())
InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
VecTy, Mask, CostKind);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reordered-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reordered-loads.ll
new file mode 100644
index 0000000000000..2ecc27905b3d3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reordered-loads.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s -mcpu=neoverse-512tvb | FileCheck %s
+
+define i32 @test(ptr %0, i64 %1) vscale_range(2,2) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_BODY48:.*]]
+; CHECK: [[FOR_BODY48]]:
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX52]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store float [[ADD56]], ptr [[TMP0]], align 4
+; CHECK-NEXT: br label %[[FOR_BODY48]]
+;
+entry:
+ br label %for.body48
+
+for.body48:
+ %2 = getelementptr float, ptr %0, i64 %1
+ %arrayidx52 = getelementptr i8, ptr %2, i64 8
+ %3 = load float, ptr %arrayidx52, align 4
+ %4 = load float, ptr %2, align 4
+ %add56 = fadd float %3, %4
+ store float %add56, ptr %0, align 4
+ br label %for.body48
+}