[llvm] [LV] Use forced cost once for whole interleave group in legacy costmodel (PR #168270)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 16 05:53:02 PST 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/168270
The VPlan-based cost model assigns the forced cost once for a whole VPInterleaveRecipe. Update the legacy cost model to match this behavior. This fixes a cost-model divergence, and assigns the cost in a way that matches the generated code more accurately.
From 784fb39fc251ace12d0c279fc56b79bf11c42de3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 16 Nov 2025 13:47:15 +0000
Subject: [PATCH] [LV] Use forced cost once for whole interleave group in legacy costmodel
The VPlan-based cost model assigns the forced cost once for a whole
VPInterleaveRecipe. Update the legacy cost model to match this behavior.
This fixes a cost-model divergence, and assigns the cost in a way that
matches the generated code more accurately.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 14 +-
.../AArch64/force-target-instruction-cost.ll | 158 ++++++++++++++++++
2 files changed, 170 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cbfbc29360b0b..679ff4ed63df4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5122,8 +5122,18 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
- C = InstructionCost(ForceTargetInstructionCost);
+ if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
+ // For interleave groups, use ForceTargetInstructionCost once for the
+ // whole group.
+ if (VF.isVector() && getWideningDecision(&I, VF) == CM_Interleave) {
+ if (getInterleavedAccessGroup(&I)->getInsertPos() == &I)
+ C = InstructionCost(ForceTargetInstructionCost);
+ else
+ C = InstructionCost(0);
+ } else {
+ C = InstructionCost(ForceTargetInstructionCost);
+ }
+ }
BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 29bbd015eed1f..d5227b15012da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -380,7 +380,165 @@ for.end:
ret void
}
+define void @interleave_group(ptr %dst) #1 {
+; COST1-LABEL: define void @interleave_group(
+; COST1-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
+; COST1-NEXT: [[ITER_CHECK:.*:]]
+; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; COST1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST1: [[VECTOR_PH]]:
+; COST1-NEXT: br label %[[VECTOR_BODY:.*]]
+; COST1: [[VECTOR_BODY]]:
+; COST1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; COST1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
+; COST1-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], 3
+; COST1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 3
+; COST1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; COST1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP3]], align 1
+; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP4]], align 1
+; COST1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; COST1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; COST1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; COST1: [[MIDDLE_BLOCK]]:
+; COST1-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; COST1: [[VEC_EPILOG_ITER_CHECK]]:
+; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
+; COST1: [[VEC_EPILOG_PH]]:
+; COST1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; COST1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; COST1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; COST1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; COST1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; COST1: [[VEC_EPILOG_VECTOR_BODY]]:
+; COST1-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; COST1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; COST1-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
+; COST1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
+; COST1-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; COST1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; COST1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; COST1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; COST1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; COST1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
+; COST1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
+; COST1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
+; COST1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 2
+; COST1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 2
+; COST1-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 2
+; COST1-NEXT: store i8 0, ptr [[TMP15]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP16]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP17]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP18]], align 1
+; COST1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
+; COST1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i64 1
+; COST1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP13]], i64 1
+; COST1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP14]], i64 1
+; COST1-NEXT: store i8 0, ptr [[TMP19]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP20]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP21]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP22]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP11]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP12]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP13]], align 1
+; COST1-NEXT: store i8 0, ptr [[TMP14]], align 1
+; COST1-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
+; COST1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; COST1-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
+; COST1-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; COST1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; COST1-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; COST1: [[VEC_EPILOG_SCALAR_PH]]:
+;
+; COST10-LABEL: define void @interleave_group(
+; COST10-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
+; COST10-NEXT: [[ITER_CHECK:.*:]]
+; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; COST10: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST10: [[VECTOR_PH]]:
+; COST10-NEXT: br label %[[VECTOR_BODY:.*]]
+; COST10: [[VECTOR_BODY]]:
+; COST10-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; COST10-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
+; COST10-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; COST10-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP1]], align 1
+; COST10-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; COST10-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; COST10-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; COST10: [[MIDDLE_BLOCK]]:
+; COST10-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; COST10: [[VEC_EPILOG_ITER_CHECK]]:
+; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
+; COST10: [[VEC_EPILOG_PH]]:
+; COST10-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; COST10-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; COST10-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; COST10-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; COST10-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; COST10: [[VEC_EPILOG_VECTOR_BODY]]:
+; COST10-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; COST10-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; COST10-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
+; COST10-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; COST10-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; COST10-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; COST10-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; COST10-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; COST10-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
+; COST10-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; COST10-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; COST10-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 2
+; COST10-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2
+; COST10-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 2
+; COST10-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
+; COST10-NEXT: store i8 0, ptr [[TMP12]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP13]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP14]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP15]], align 1
+; COST10-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP8]], i64 1
+; COST10-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
+; COST10-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP10]], i64 1
+; COST10-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
+; COST10-NEXT: store i8 0, ptr [[TMP16]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP17]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP18]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP19]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP8]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP9]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP10]], align 1
+; COST10-NEXT: store i8 0, ptr [[TMP11]], align 1
+; COST10-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
+; COST10-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; COST10-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
+; COST10-NEXT: br i1 [[TMP20]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; COST10: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; COST10-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; COST10: [[VEC_EPILOG_SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.3 = mul i64 %iv, 3
+ %gep.0 = getelementptr i8, ptr %dst, i64 %iv.3
+ %gep.2 = getelementptr i8, ptr %gep.0, i64 2
+ store i8 0, ptr %gep.2, align 1
+ %gep.1 = getelementptr i8, ptr %gep.0, i64 1
+ store i8 0, ptr %gep.1, align 1
+ store i8 0, ptr %gep.0, align 1
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 100
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
attributes #0 = { "target-features"="+neon,+sve" vscale_range(1,16) }
+attributes #1 = { "target-cpu"="neoverse-512tvb" }
declare void @llvm.assume(i1 noundef)
declare i64 @llvm.umin.i64(i64, i64)
More information about the llvm-commits mailing list