[llvm] [SystemZ, LoopVectorizer] Enable vectorization of epilogue loops after VF16. (PR #172925)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 13 09:30:57 PST 2026
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/172925
>From d4b4bfc81b1d6f34fa5c060318e1fb1f9778d3f7 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 19 Dec 2025 00:07:29 +0100
Subject: [PATCH 1/7] LV: Enable vectorization of epilogue loops after VF16.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 10 +++++++
.../SystemZ/SystemZTargetTransformInfo.h | 2 ++
.../SystemZ/vectorized-epilogue-loop.ll | 28 +++++++++++++++++++
3 files changed, 40 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 4322773f4afd6..4f1667872abd3 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -478,6 +478,16 @@ unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
+unsigned SystemZTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
+ // Enable vectorization of (LoopVectorizer) epilogue loop after VF 16 main
+ // loop. This can be highly beneficial when the original loop handles bytes
+ // (i8) and most of the time is not spent in the main vectorized loop
+ // (x264/mc_chroma).
+ if (VF == ElementCount::getFixed(16))
+ return 2;
+ return 1;
+}
+
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
EVT VT = TLI->getValueType(DL, DataType);
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f4ba29c987f09..059dd704327ce 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -82,6 +82,8 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
bool HasCall) const override;
bool enableWritePrefetching() const override { return true; }
+ unsigned getMaxInterleaveFactor(ElementCount VF) const override;
+
bool hasDivRemOp(Type *DataType, bool IsSigned) const override;
bool prefersVectorizedAddressing() const override { return false; }
bool LSRWithInstrQueries() const override { return true; }
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
new file mode 100644
index 0000000000000..654d636045f76
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z16 -passes=loop-vectorize < %s \
+; RUN: | FileCheck %s
+;
+; Test that loop vectorizer generates a vectorized epilogue loop after a VF16
+; vectorization.
+
+define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
+; CHECK-LABEL: @fun(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <16 x i8>
+; CHECK-LABEL: vec.epilog.vector.body:
+; CHECK: %wide.load8 = load <4 x i8>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx0 = getelementptr i8, ptr %Src, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx0
+ %arrayidx1 = getelementptr i8, ptr %Dst, i64 %indvars.iv
+ store i8 %0, ptr %arrayidx1
+ %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
>From e189dbccf23d8ca7554846ba271c7d8c5bd04e94 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 19 Dec 2025 20:15:15 +0100
Subject: [PATCH 2/7] Refactor preferEpilogueVectorization to subsume the
interleave check by default. Return true for SystemZ. Leave SystemZ
getMaxInterleaveFactor() to return 1 always for now.
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 5 +++--
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 6 +++++-
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++--
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 2 +-
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 9 +++------
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h | 4 ++++
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++-------
.../LoopVectorize/SystemZ/vectorized-epilogue-loop.ll | 2 +-
8 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a013122df5f06..5cacba5dc5a50 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1932,8 +1932,9 @@ class TargetTransformInfo {
LLVM_ABI bool preferPredicatedReductionSelect() const;
/// Return true if the loop vectorizer should consider vectorizing an
- /// otherwise scalar epilogue loop.
- LLVM_ABI bool preferEpilogueVectorization() const;
+ /// otherwise scalar epilogue loop. VF is what was used for the vectorized
+ /// loop body.
+ LLVM_ABI bool preferEpilogueVectorization(ElementCount VF) const;
/// \returns True if the loop vectorizer should discard any VFs where the
/// maximum register pressure exceeds getNumberOfRegisters.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6d27cabf404f8..9d6d378a48ad8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1134,7 +1134,11 @@ class TargetTransformInfoImplBase {
virtual bool preferPredicatedReductionSelect() const { return false; }
- virtual bool preferEpilogueVectorization() const { return true; }
+ virtual bool preferEpilogueVectorization(ElementCount VF) const {
+ // We consider epilogue vectorization unprofitable for targets that
+ // don't consider interleaving beneficial (eg. MVE).
+ return getMaxInterleaveFactor(VF) > 1;
+ }
virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 19785204ed2b3..769fbdb2e91a0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1450,8 +1450,8 @@ bool TargetTransformInfo::preferPredicatedReductionSelect() const {
return TTIImpl->preferPredicatedReductionSelect();
}
-bool TargetTransformInfo::preferEpilogueVectorization() const {
- return TTIImpl->preferEpilogueVectorization();
+bool TargetTransformInfo::preferEpilogueVectorization(ElementCount VF) const {
+ return TTIImpl->preferEpilogueVectorization(VF);
}
bool TargetTransformInfo::shouldConsiderVectorizationRegPressure() const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index faa87b4fc48dd..b873a73b18770 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -139,7 +139,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
bool preferAlternateOpcodeVectorization() const override;
- bool preferEpilogueVectorization() const override {
+ bool preferEpilogueVectorization(ElementCount VF) const override {
// Epilogue vectorization is usually unprofitable - tail folding or
// a smaller VF would have been better. This a blunt hammer - we
// should re-examine this once vectorization is better tuned.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 4f1667872abd3..041f12ac5f660 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -479,12 +479,9 @@ unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
}
unsigned SystemZTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
- // Enable vectorization of (LoopVectorizer) epilogue loop after VF 16 main
- // loop. This can be highly beneficial when the original loop handles bytes
- // (i8) and most of the time is not spent in the main vectorized loop
- // (x264/mc_chroma).
- if (VF == ElementCount::getFixed(16))
- return 2;
+ // TODO: Find optimal settings for interleave factors.
+ // if (VF == ElementCount::getFixed(16))
+ // return 2;
return 1;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 059dd704327ce..f1bfdc2b94ded 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -153,6 +153,10 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
+ bool preferEpilogueVectorization(ElementCount VF) const override {
+ return true;
+ }
+
bool shouldExpandReduction(const IntrinsicInst *II) const override;
/// @}
};
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b88779a7828fb..e9278b0574cb4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4407,13 +4407,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// account. For now we apply a very crude heuristic and only consider loops
// with vectorization factors larger than a certain value.
- // Allow the target to opt out entirely.
- if (!TTI.preferEpilogueVectorization())
- return false;
-
- // We also consider epilogue vectorization unprofitable for targets that don't
- // consider interleaving beneficial (eg. MVE).
- if (TTI.getMaxInterleaveFactor(VF) <= 1)
+ // Allow the target to opt out.
+ if (!TTI.preferEpilogueVectorization(VF))
return false;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
index 654d636045f76..14998d8f22a46 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
@@ -9,7 +9,7 @@ define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
; CHECK-LABEL: vector.body:
; CHECK: %wide.load = load <16 x i8>
; CHECK-LABEL: vec.epilog.vector.body:
-; CHECK: %wide.load8 = load <4 x i8>
+; CHECK: %wide.load7 = load <4 x i8>
entry:
br label %for.body
>From fc144585f69208f0362e4a67d1a51704930791b2 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Thu, 15 Jan 2026 22:07:13 +0100
Subject: [PATCH 3/7] Generate test checks for the vectorized-epilogue-loop.ll
test.
---
.../SystemZ/vectorized-epilogue-loop.ll | 80 +++++++++++++++++--
1 file changed, 75 insertions(+), 5 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
index 14998d8f22a46..4072871ac8dce 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z16 -passes=loop-vectorize < %s \
; RUN: | FileCheck %s
;
@@ -5,11 +6,72 @@
; vectorization.
define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
-; CHECK-LABEL: @fun(
-; CHECK-LABEL: vector.body:
-; CHECK: %wide.load = load <16 x i8>
-; CHECK-LABEL: vec.epilog.vector.body:
-; CHECK: %wide.load7 = load <4 x i8>
+; CHECK-LABEL: define void @fun(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[WIDE_TRIP_COUNT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX6]]
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]]
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD7]], ptr [[TMP6]], align 1
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[CMP_N9]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX0]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP8]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
entry:
br label %for.body
@@ -26,3 +88,11 @@ for.body:
exit:
ret void
}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+;.
>From af12ad5665f6aef5c62aa107f6d88b0436634cc6 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <Jonas.Paulsson2 at ibm.com>
Date: Tue, 20 Jan 2026 10:24:09 -0600
Subject: [PATCH 4/7] Commenting.
Co-authored-by: Florian Hahn <flo at fhahn.com>
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5cacba5dc5a50..05bb54525569f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1932,8 +1932,7 @@ class TargetTransformInfo {
LLVM_ABI bool preferPredicatedReductionSelect() const;
/// Return true if the loop vectorizer should consider vectorizing an
- /// otherwise scalar epilogue loop. VF is what was used for the vectorized
- /// loop body.
+ /// otherwise scalar epilogue loop if the loop already has been vectorized with \p VF.
LLVM_ABI bool preferEpilogueVectorization(ElementCount VF) const;
/// \returns True if the loop vectorizer should discard any VFs where the
>From b8668ffd3e7bf44ffd6763bf63ebe7a7bc4f0813 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <Jonas.Paulsson2 at ibm.com>
Date: Tue, 20 Jan 2026 10:25:53 -0600
Subject: [PATCH 5/7] clang-format
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 05bb54525569f..61a14183996f2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1932,7 +1932,8 @@ class TargetTransformInfo {
LLVM_ABI bool preferPredicatedReductionSelect() const;
/// Return true if the loop vectorizer should consider vectorizing an
- /// otherwise scalar epilogue loop if the loop already has been vectorized with \p VF.
+ /// otherwise scalar epilogue loop if the loop already has been vectorized
+ /// with \p VF.
LLVM_ABI bool preferEpilogueVectorization(ElementCount VF) const;
/// \returns True if the loop vectorizer should discard any VFs where the
>From 424a4a9562c5244e6d9e7760a6bcb5b90c9ea1e9 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 6 Feb 2026 19:57:20 +0100
Subject: [PATCH 6/7] Rebase + review updates.
---
.../llvm/Analysis/TargetTransformInfo.h | 4 +--
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +--
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +--
.../Target/RISCV/RISCVTargetTransformInfo.h | 2 +-
.../SystemZ/SystemZTargetTransformInfo.cpp | 7 ----
.../SystemZ/SystemZTargetTransformInfo.h | 4 +--
.../Transforms/Vectorize/LoopVectorize.cpp | 2 +-
.../SystemZ/vectorized-epilogue-loop.ll | 34 ++++++++-----------
8 files changed, 23 insertions(+), 38 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 61a14183996f2..c9757bab75da5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1933,8 +1933,8 @@ class TargetTransformInfo {
/// Return true if the loop vectorizer should consider vectorizing an
/// otherwise scalar epilogue loop if the loop already has been vectorized
- /// with \p VF.
- LLVM_ABI bool preferEpilogueVectorization(ElementCount VF) const;
+ /// with VF x IC, running \p Iters scalar iterations per vector iteration.
+ LLVM_ABI bool preferEpilogueVectorization(ElementCount Iters) const;
/// \returns True if the loop vectorizer should discard any VFs where the
/// maximum register pressure exceeds getNumberOfRegisters.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9d6d378a48ad8..fd9db944700b7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1134,10 +1134,10 @@ class TargetTransformInfoImplBase {
virtual bool preferPredicatedReductionSelect() const { return false; }
- virtual bool preferEpilogueVectorization(ElementCount VF) const {
+ virtual bool preferEpilogueVectorization(ElementCount Iters) const {
// We consider epilogue vectorization unprofitable for targets that
// don't consider interleaving beneficial (eg. MVE).
- return getMaxInterleaveFactor(VF) > 1;
+ return getMaxInterleaveFactor(Iters) > 1;
}
virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 769fbdb2e91a0..a9d8c1ef3d325 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1450,8 +1450,8 @@ bool TargetTransformInfo::preferPredicatedReductionSelect() const {
return TTIImpl->preferPredicatedReductionSelect();
}
-bool TargetTransformInfo::preferEpilogueVectorization(ElementCount VF) const {
- return TTIImpl->preferEpilogueVectorization(VF);
+bool TargetTransformInfo::preferEpilogueVectorization(ElementCount Iters) const {
+ return TTIImpl->preferEpilogueVectorization(Iters);
}
bool TargetTransformInfo::shouldConsiderVectorizationRegPressure() const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index b873a73b18770..eed5e8ba4aee8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -139,7 +139,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
bool preferAlternateOpcodeVectorization() const override;
- bool preferEpilogueVectorization(ElementCount VF) const override {
+ bool preferEpilogueVectorization(ElementCount Iters) const override {
// Epilogue vectorization is usually unprofitable - tail folding or
// a smaller VF would have been better. This a blunt hammer - we
// should re-examine this once vectorization is better tuned.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 041f12ac5f660..4322773f4afd6 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -478,13 +478,6 @@ unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
-unsigned SystemZTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
- // TODO: Find optimal settings for interleave factors.
- // if (VF == ElementCount::getFixed(16))
- // return 2;
- return 1;
-}
-
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
EVT VT = TLI->getValueType(DL, DataType);
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f1bfdc2b94ded..d96036067c786 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -82,8 +82,6 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
bool HasCall) const override;
bool enableWritePrefetching() const override { return true; }
- unsigned getMaxInterleaveFactor(ElementCount VF) const override;
-
bool hasDivRemOp(Type *DataType, bool IsSigned) const override;
bool prefersVectorizedAddressing() const override { return false; }
bool LSRWithInstrQueries() const override { return true; }
@@ -153,7 +151,7 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
- bool preferEpilogueVectorization(ElementCount VF) const override {
+ bool preferEpilogueVectorization(ElementCount Iters) const override {
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9278b0574cb4..c477c1746f18b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4408,7 +4408,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// with vectorization factors larger than a certain value.
// Allow the target to opt out.
- if (!TTI.preferEpilogueVectorization(VF))
+ if (!TTI.preferEpilogueVectorization(VF * IC))
return false;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
index 4072871ac8dce..18213ef681a83 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/vectorized-epilogue-loop.ll
@@ -2,22 +2,16 @@
; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z16 -passes=loop-vectorize < %s \
; RUN: | FileCheck %s
;
-; Test that loop vectorizer generates a vectorized epilogue loop after a VF16
-; vectorization.
+; Test that the loop vectorizer generates a vectorized epilogue loop after
+; vectorizing the main loop with VF = 16.
-define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
+define void @fun(ptr noalias %Src, ptr noalias %Dst, i64 %N) {
; CHECK-LABEL: define void @fun(
-; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[WIDE_TRIP_COUNT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ITER_CHECK:.*]]:
-; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
-; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
-; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC2]]
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -58,7 +52,7 @@ define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
; CHECK-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N9]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -67,7 +61,7 @@ define void @fun(ptr %Src, ptr %Dst, i64 %wide.trip.count) {
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX0]], align 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[TMP8]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
@@ -76,13 +70,13 @@ entry:
br label %for.body
for.body:
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %indvars.iv.next = add i64 %indvars.iv, 1
- %arrayidx0 = getelementptr i8, ptr %Src, i64 %indvars.iv
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %iv.next = add i64 %iv, 1
+ %arrayidx0 = getelementptr i8, ptr %Src, i64 %iv
%0 = load i8, ptr %arrayidx0
- %arrayidx1 = getelementptr i8, ptr %Dst, i64 %indvars.iv
+ %arrayidx1 = getelementptr i8, ptr %Dst, i64 %iv
store i8 %0, ptr %arrayidx1
- %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count
+ %exitcond.not = icmp eq i64 %iv, %N
br i1 %exitcond.not, label %exit, label %for.body
exit:
@@ -94,5 +88,5 @@ exit:
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
>From 1b15bb3e1aef58a29785411ab7ea8e44753e0118 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <Jonas.Paulsson2 at ibm.com>
Date: Fri, 13 Feb 2026 11:30:44 -0600
Subject: [PATCH 7/7] Update llvm/include/llvm/Analysis/TargetTransformInfo.h
commenting
Co-authored-by: Florian Hahn <flo at fhahn.com>
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c9757bab75da5..5c6ef22df0856 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1933,7 +1933,7 @@ class TargetTransformInfo {
/// Return true if the loop vectorizer should consider vectorizing an
/// otherwise scalar epilogue loop if the loop already has been vectorized
- /// with VF x IC, running \p Iters scalar iterations per vector iteration.
+ /// so that it processes \p Iters scalar iterations per vector iteration.
LLVM_ABI bool preferEpilogueVectorization(ElementCount Iters) const;
/// \returns True if the loop vectorizer should discard any VFs where the
More information about the llvm-commits
mailing list