[llvm] 3918ef3 - [SLP] Fix the analysis for masked compress loads
Author: Alexey Bataev
Date: 2025-05-20T07:31:16-04:00
New Revision: 3918ef3688dc7e9ef1c0158867efe2b88bf68bec
URL: https://github.com/llvm/llvm-project/commit/3918ef3688dc7e9ef1c0158867efe2b88bf68bec
DIFF: https://github.com/llvm/llvm-project/commit/3918ef3688dc7e9ef1c0158867efe2b88bf68bec.diff
LOG: [SLP] Fix the analysis for masked compress loads
The check for Orders in the interleaved-loads analysis needs to be removed,
and the shuffle cost estimated without the reordering, to correctly handle
the costs of masked compress loads.
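For reference, a masked compress load is a wider masked load whose live
lanes are then compressed by a shufflevector, as in the updated RISC-V test
below. A minimal IR sketch (the pointer %p, types, and mask here are
illustrative, taken from the strided-loads test in this patch):

    ; load 3 lanes under a mask, then compress the two live lanes (0 and 2)
    %wide = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %p, i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
    %vals = shufflevector <3 x i16> %wide, <3 x i16> poison, <2 x i32> <i32 0, i32 2>

The cost of that compressing shuffle is what the patch now computes once, on
CompressMask before any reorder mask is folded in, instead of charging a
separate permute for the reordering in the interleaved-load path.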
Reviewers: hiraditya, HanKuanChen, RKSimon
Reviewed By: HanKuanChen, RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/140647
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5b9ced4561a0c..fcb9da637dd37 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5944,10 +5944,9 @@ static bool isMaskedLoadCompress(
// Check for potential segmented(interleaved) loads.
VectorType *AlignedLoadVecTy = getWidenedType(
ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
- if (!isSafeToLoadUnconditionally(
- Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
- cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
- &DT, &TLI))
+ if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
+ DL, cast<LoadInst>(VL.back()), &AC, &DT,
+ &TLI))
AlignedLoadVecTy = LoadVecTy;
if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
CommonAlignment,
@@ -5957,9 +5956,6 @@ static bool isMaskedLoadCompress(
Instruction::Load, AlignedLoadVecTy,
CompressMask[1], std::nullopt, CommonAlignment,
LI->getPointerAddressSpace(), CostKind, IsMasked);
- if (!Mask.empty())
- InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
- VecTy, Mask, CostKind);
if (InterleavedCost < GatherCost) {
InterleaveFactor = CompressMask[1];
LoadVecTy = AlignedLoadVecTy;
@@ -5967,6 +5963,8 @@ static bool isMaskedLoadCompress(
}
}
}
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
if (!Order.empty()) {
SmallVector<int> NewMask(Sz, PoisonMaskElem);
for (unsigned I : seq<unsigned>(Sz)) {
@@ -5974,8 +5972,6 @@ static bool isMaskedLoadCompress(
}
CompressMask.swap(NewMask);
}
- InstructionCost CompressCost = ::getShuffleCost(
- TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
return TotalVecCost < GatherCost;
}
@@ -13553,10 +13549,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallVector<Value *> PointerOps(Scalars.size());
for (auto [I, V] : enumerate(Scalars))
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
- (void)isMaskedLoadCompress(
+ [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
*TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
CompressMask, LoadVecTy);
+ assert(IsVectorized && "Failed to vectorize load");
CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
InterleaveFactor, IsMasked);
Align CommonAlignment = LI0->getAlign();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index bce0884e92925..07094c642f8da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -15,10 +15,11 @@ define i16 @test() {
; CHECK-NEXT: [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
; CHECK-NEXT: [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
-; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
index 1b65a7ac1c311..4dd659a7ae802 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
@@ -9,18 +9,20 @@ define void @test(ptr %mdct_forward_x) {
; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
; CHECK-NEXT: [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT: [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX2_I_I]], i64 -8, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 48, i64 40>
-; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP6]], i32 4, <2 x i1> splat (i1 true), <2 x float> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP10]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP22]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
index 843d1cf46ffcc..7d65fe1bcde76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
@@ -9,17 +9,16 @@ define void @test() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
; CHECK-NEXT: [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
; CHECK-NEXT: [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32> <i32 5, i32 0, i32 3, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 9, i32 1>
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>