[llvm] [AArch64] Allow unrolling of scalar epilogue loops (PR #151164)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 09:23:22 PDT 2025
https://github.com/john-brawn-arm updated https://github.com/llvm/llvm-project/pull/151164
From f29281bd3be5688ec421696d5a7563bc72e1244d Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Mon, 28 Jul 2025 17:34:50 +0100
Subject: [PATCH 1/3] [AArch64] Allow unrolling of scalar epilogue loops
PR#147420 changed the unrolling preferences to permit unrolling of
non-auto-vectorized loops by checking for the isvectorized attribute.
However, when a loop is vectorized this attribute is placed on both the
vector loop and the scalar epilogue, so that change also prevented the
scalar epilogue from being unrolled.

Restore the previous behaviour of unrolling the scalar epilogue by
checking both for the isvectorized attribute and for the presence of
vector instructions in the loop.
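
For illustration, here is a minimal hand-written IR sketch of the shape the
loop vectorizer produces (not taken from the patch; the function name @sketch
and the simplified control flow are hypothetical, and it assumes %n >= 16 with
the vectorizer's runtime guards omitted): both the vector loop and the scalar
remainder carry the llvm.loop.isvectorized attribute, but only the vector body
contains vector-typed instructions, which is what the updated check keys on.

define void @sketch(ptr %p, i64 %n) {
entry:
  %n.vec = and i64 %n, -16                ; vector trip count, multiple of 16
  br label %vector.body

vector.body:                              ; contains vector-typed instructions
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %gep = getelementptr inbounds i8, ptr %p, i64 %index
  %wide = load <16 x i8>, ptr %gep, align 1
  store <16 x i8> %wide, ptr %gep, align 1
  %index.next = add i64 %index, 16
  %vec.done = icmp eq i64 %index.next, %n.vec
  br i1 %vec.done, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:
  %cmp.n = icmp eq i64 %n, %n.vec
  br i1 %cmp.n, label %exit, label %scalar.remainder

scalar.remainder:                         ; scalar instructions only
  %i = phi i64 [ %n.vec, %middle.block ], [ %i.next, %scalar.remainder ]
  %gep.s = getelementptr inbounds i8, ptr %p, i64 %i
  %byte = load i8, ptr %gep.s, align 1
  store i8 %byte, ptr %gep.s, align 1
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %scalar.remainder, !llvm.loop !1

exit:
  ret void
}

; Both loops reference the same isvectorized attribute.
!0 = distinct !{!0, !2}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.isvectorized", i32 1}

With this change, getUnrollingPreferences bails out only when it finds a
vector-typed instruction in a loop tagged as vectorized, so a scalar remainder
like the one above remains eligible for runtime unrolling.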
---
.../AArch64/AArch64TargetTransformInfo.cpp | 13 +-
.../Transforms/LoopUnroll/AArch64/vector.ll | 277 ++++++++++++++++++
2 files changed, 285 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 40f49dade6131..18ca22fc9f211 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4905,14 +4905,17 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- // No need to unroll auto-vectorized loops
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
- return;
-
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining.
+ // inlining. Don't unroll auto-vectorized loops either, though do allow
+ // unrolling of the scalar remainder.
+ bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ // Both auto-vectorized loops and the scalar remainder have the
+ // isvectorized attribute, so differentiate between them by the presence
+ // of vector instructions.
+ if (IsVectorized && I.getType()->isVectorTy())
+ return;
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 8baded897fd7d..e0189d17349b6 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -485,12 +485,289 @@ exit: ; preds = %vector.body
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
+; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
+; vector loop or vector epilogue loop.
+define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
+; APPLE-LABEL: define void @scalar_epilogue(
+; APPLE-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; APPLE: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
+; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; APPLE: [[VECTOR_PH]]:
+; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; APPLE: [[VECTOR_BODY]]:
+; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
+; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; APPLE-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; APPLE-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; APPLE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
+; APPLE-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
+; APPLE-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
+; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; APPLE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; APPLE: [[MIDDLE_BLOCK]]:
+; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; APPLE: [[VEC_EPILOG_ITER_CHECK]]:
+; APPLE-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
+; APPLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
+; APPLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
+; APPLE: [[VEC_EPILOG_PH]]:
+; APPLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; APPLE-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; APPLE-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; APPLE: [[VEC_EPILOG_VECTOR_BODY]]:
+; APPLE-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
+; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; APPLE-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
+; APPLE-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; APPLE-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
+; APPLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; APPLE-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; APPLE: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; APPLE-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; APPLE-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
+; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; APPLE-NEXT: br label %[[FOR_BODY:.*]]
+; APPLE: [[FOR_BODY]]:
+; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
+; APPLE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[VAL]]
+; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
+; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
+; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; APPLE: [[EXIT_LOOPEXIT]]:
+; APPLE-NEXT: br label %[[EXIT]]
+; APPLE: [[EXIT]]:
+; APPLE-NEXT: ret void
+;
+; CORTEXA55-LABEL: define void @scalar_epilogue(
+; CORTEXA55-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT: [[ENTRY:.*]]:
+; CORTEXA55-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CORTEXA55: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
+; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CORTEXA55: [[VECTOR_PH]]:
+; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
+; CORTEXA55: [[VECTOR_BODY]]:
+; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
+; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; CORTEXA55-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CORTEXA55-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CORTEXA55-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
+; CORTEXA55-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
+; CORTEXA55-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
+; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CORTEXA55-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CORTEXA55: [[MIDDLE_BLOCK]]:
+; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CORTEXA55: [[VEC_EPILOG_ITER_CHECK]]:
+; CORTEXA55-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
+; CORTEXA55-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
+; CORTEXA55-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
+; CORTEXA55: [[VEC_EPILOG_PH]]:
+; CORTEXA55-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CORTEXA55-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CORTEXA55-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CORTEXA55: [[VEC_EPILOG_VECTOR_BODY]]:
+; CORTEXA55-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CORTEXA55-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
+; CORTEXA55-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; CORTEXA55-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
+; CORTEXA55-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CORTEXA55-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
+; CORTEXA55-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; CORTEXA55-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CORTEXA55: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CORTEXA55-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; CORTEXA55-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
+; CORTEXA55: [[FOR_BODY_PREHEADER]]:
+; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
+; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
+; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
+; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
+; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_PROL_PREHEADER:.*]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]]
+; CORTEXA55: [[FOR_BODY_PROL_PREHEADER]]:
+; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL:.*]]
+; CORTEXA55: [[FOR_BODY_PROL]]:
+; CORTEXA55-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
+; CORTEXA55-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
+; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
+; CORTEXA55-NEXT: [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
+; CORTEXA55-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
+; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY_PROL_1:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]]
+; CORTEXA55: [[FOR_BODY_PROL_1]]:
+; CORTEXA55-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
+; CORTEXA55-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
+; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
+; CORTEXA55-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
+; CORTEXA55-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
+; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[FOR_BODY_PROL_2:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
+; CORTEXA55: [[FOR_BODY_PROL_2]]:
+; CORTEXA55-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
+; CORTEXA55-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
+; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
+; CORTEXA55-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
+; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
+; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT: [[I_06_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[FOR_BODY_PROL]] ], [ [[INC_PROL_1]], %[[FOR_BODY_PROL_1]] ], [ [[INC_PROL_2]], %[[FOR_BODY_PROL_2]] ]
+; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT]]
+; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT]]:
+; CORTEXA55-NEXT: [[I_06_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[I_06_UNR_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CORTEXA55-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
+; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; CORTEXA55: [[FOR_BODY_PREHEADER_NEW]]:
+; CORTEXA55-NEXT: br label %[[FOR_BODY:.*]]
+; CORTEXA55: [[FOR_BODY]]:
+; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[I_06_UNR]], %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
+; CORTEXA55-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
+; CORTEXA55-NEXT: [[INC:%.*]] = add nuw i64 [[I_06]], 1
+; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
+; CORTEXA55-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
+; CORTEXA55-NEXT: [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
+; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
+; CORTEXA55-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
+; CORTEXA55-NEXT: [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
+; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
+; CORTEXA55-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[VAL]]
+; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
+; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
+; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
+; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
+; CORTEXA55: [[EXIT_LOOPEXIT]]:
+; CORTEXA55-NEXT: br label %[[EXIT]]
+; CORTEXA55: [[EXIT]]:
+; CORTEXA55-NEXT: ret void
+;
+entry:
+ %min.iters.check = icmp ult i64 %N, 8
+ br i1 %min.iters.check, label %for.body, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check:
+ %min.iters.check7 = icmp ult i64 %N, 32
+ br i1 %min.iters.check7, label %vec.epilog.ph, label %vector.ph
+
+vector.ph:
+ %n.vec = and i64 %N, -32
+ %broadcast.splatinsert = insertelement <16 x i8> poison, i8 %val, i64 0
+ %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %p, i64 %index
+ %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %wide.load8 = load <16 x i8>, ptr %1, align 1
+ %2 = add <16 x i8> %wide.load, %broadcast.splat
+ %3 = add <16 x i8> %wide.load8, %broadcast.splat
+ store <16 x i8> %2, ptr %0, align 1
+ store <16 x i8> %3, ptr %1, align 1
+ %index.next = add nuw i64 %index, 32
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body, !llvm.loop !2
+
+middle.block:
+ %cmp.n = icmp eq i64 %N, %n.vec
+ br i1 %cmp.n, label %exit, label %vec.epilog.iter.check
+
+vec.epilog.iter.check:
+ %n.vec.remaining = and i64 %N, 24
+ %min.epilog.iters.check = icmp eq i64 %n.vec.remaining, 0
+ br i1 %min.epilog.iters.check, label %for.body, label %vec.epilog.ph
+
+vec.epilog.ph:
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec10 = and i64 %N, -8
+ %broadcast.splatinsert11 = insertelement <8 x i8> poison, i8 %val, i64 0
+ %broadcast.splat12 = shufflevector <8 x i8> %broadcast.splatinsert11, <8 x i8> poison, <8 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body:
+ %index13 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next15, %vec.epilog.vector.body ]
+ %5 = getelementptr inbounds nuw i8, ptr %p, i64 %index13
+ %wide.load14 = load <8 x i8>, ptr %5, align 1
+ %6 = add <8 x i8> %wide.load14, %broadcast.splat12
+ store <8 x i8> %6, ptr %5, align 1
+ %index.next15 = add nuw i64 %index13, 8
+ %7 = icmp eq i64 %index.next15, %n.vec10
+ br i1 %7, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !llvm.loop !3
+
+vec.epilog.middle.block:
+ %cmp.n16 = icmp eq i64 %N, %n.vec10
+ br i1 %cmp.n16, label %exit, label %for.body
+
+for.body:
+ %i.06 = phi i64 [ %inc, %for.body ], [ %n.vec10, %vec.epilog.middle.block ], [ %n.vec, %vec.epilog.iter.check ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %i.06
+ %8 = load i8, ptr %arrayidx, align 1
+ %add = add i8 %8, %val
+ store i8 %add, ptr %arrayidx, align 1
+ %inc = add nuw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %N
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !4
+
+exit:
+ ret void
+}
+
+!2 = distinct !{!2, !1}
+!3 = distinct !{!3, !1}
+!4 = distinct !{!4, !1}
+
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
+; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
+; APPLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]}
;.
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CORTEXA55: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
;.
From 7d22ee512374f8be7fb59c56179db32e1994b533 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Wed, 30 Jul 2025 11:44:53 +0100
Subject: [PATCH 2/3] Simplify test
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 111 +++---------------
1 file changed, 14 insertions(+), 97 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index e0189d17349b6..b4e12ab03cf1b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -486,16 +486,13 @@ exit: ; preds = %vector.body
!1 = !{!"llvm.loop.isvectorized", i32 1}
; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
-; vector loop or vector epilogue loop.
+; vector loop.
define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; APPLE-LABEL: define void @scalar_epilogue(
; APPLE-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
-; APPLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
-; APPLE: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
-; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; APPLE: [[VECTOR_PH]]:
; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
@@ -516,31 +513,9 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; APPLE: [[MIDDLE_BLOCK]]:
; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
-; APPLE: [[VEC_EPILOG_ITER_CHECK]]:
-; APPLE-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
-; APPLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; APPLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
-; APPLE: [[VEC_EPILOG_PH]]:
-; APPLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; APPLE-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
-; APPLE-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
-; APPLE-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
-; APPLE-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; APPLE: [[VEC_EPILOG_VECTOR_BODY]]:
-; APPLE-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
-; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
-; APPLE-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
-; APPLE-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; APPLE-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
-; APPLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
-; APPLE-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; APPLE: [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; APPLE-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
-; APPLE-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
+; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
; APPLE: [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ]
@@ -550,7 +525,7 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; APPLE: [[EXIT_LOOPEXIT]]:
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
@@ -559,11 +534,8 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-LABEL: define void @scalar_epilogue(
; CORTEXA55-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
-; CORTEXA55-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
-; CORTEXA55: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
-; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CORTEXA55: [[VECTOR_PH]]:
; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
@@ -584,31 +556,9 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CORTEXA55: [[MIDDLE_BLOCK]]:
; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
-; CORTEXA55: [[VEC_EPILOG_ITER_CHECK]]:
-; CORTEXA55-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[N]], 24
-; CORTEXA55-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; CORTEXA55-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]]
-; CORTEXA55: [[VEC_EPILOG_PH]]:
-; CORTEXA55-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CORTEXA55-NEXT: [[N_VEC10:%.*]] = and i64 [[N]], -8
-; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[VAL]], i64 0
-; CORTEXA55-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CORTEXA55-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; CORTEXA55: [[VEC_EPILOG_VECTOR_BODY]]:
-; CORTEXA55-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CORTEXA55-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX13]]
-; CORTEXA55-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
-; CORTEXA55-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[BROADCAST_SPLAT12]]
-; CORTEXA55-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CORTEXA55-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX13]], 8
-; CORTEXA55-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
-; CORTEXA55-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CORTEXA55: [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CORTEXA55-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
-; CORTEXA55-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[FOR_BODY_PREHEADER]]
+; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
; CORTEXA55: [[FOR_BODY_PREHEADER]]:
-; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
@@ -672,7 +622,7 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
-; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
; CORTEXA55: [[EXIT_LOOPEXIT]]:
@@ -681,12 +631,8 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-NEXT: ret void
;
entry:
- %min.iters.check = icmp ult i64 %N, 8
- br i1 %min.iters.check, label %for.body, label %vector.main.loop.iter.check
-
-vector.main.loop.iter.check:
- %min.iters.check7 = icmp ult i64 %N, 32
- br i1 %min.iters.check7, label %vec.epilog.ph, label %vector.ph
+ %min.iters.check = icmp ult i64 %N, 32
+ br i1 %min.iters.check, label %for.body, label %vector.ph
vector.ph:
%n.vec = and i64 %N, -32
@@ -710,43 +656,17 @@ vector.body:
middle.block:
%cmp.n = icmp eq i64 %N, %n.vec
- br i1 %cmp.n, label %exit, label %vec.epilog.iter.check
-
-vec.epilog.iter.check:
- %n.vec.remaining = and i64 %N, 24
- %min.epilog.iters.check = icmp eq i64 %n.vec.remaining, 0
- br i1 %min.epilog.iters.check, label %for.body, label %vec.epilog.ph
-
-vec.epilog.ph:
- %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
- %n.vec10 = and i64 %N, -8
- %broadcast.splatinsert11 = insertelement <8 x i8> poison, i8 %val, i64 0
- %broadcast.splat12 = shufflevector <8 x i8> %broadcast.splatinsert11, <8 x i8> poison, <8 x i32> zeroinitializer
- br label %vec.epilog.vector.body
-
-vec.epilog.vector.body:
- %index13 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next15, %vec.epilog.vector.body ]
- %5 = getelementptr inbounds nuw i8, ptr %p, i64 %index13
- %wide.load14 = load <8 x i8>, ptr %5, align 1
- %6 = add <8 x i8> %wide.load14, %broadcast.splat12
- store <8 x i8> %6, ptr %5, align 1
- %index.next15 = add nuw i64 %index13, 8
- %7 = icmp eq i64 %index.next15, %n.vec10
- br i1 %7, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !llvm.loop !3
-
-vec.epilog.middle.block:
- %cmp.n16 = icmp eq i64 %N, %n.vec10
- br i1 %cmp.n16, label %exit, label %for.body
+ br i1 %cmp.n, label %exit, label %for.body
for.body:
- %i.06 = phi i64 [ %inc, %for.body ], [ %n.vec10, %vec.epilog.middle.block ], [ %n.vec, %vec.epilog.iter.check ], [ 0, %entry ]
+ %i.06 = phi i64 [ %inc, %for.body ], [ %n.vec, %middle.block ], [ 0, %entry ]
%arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %i.06
%8 = load i8, ptr %arrayidx, align 1
%add = add i8 %8, %val
store i8 %add, ptr %arrayidx, align 1
%inc = add nuw i64 %i.06, 1
%exitcond.not = icmp eq i64 %inc, %N
- br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !4
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !3
exit:
ret void
@@ -754,7 +674,6 @@ exit:
!2 = distinct !{!2, !1}
!3 = distinct !{!3, !1}
-!4 = distinct !{!4, !1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
@@ -763,11 +682,9 @@ exit:
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
-; APPLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]}
;.
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-; CORTEXA55: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
;.
From 1433c1c6a9dcfe731cde080f0385a891c906aba4 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Wed, 30 Jul 2025 17:22:32 +0100
Subject: [PATCH 3/3] Update test based on review comments
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 140 +++++++++---------
1 file changed, 70 insertions(+), 70 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index b4e12ab03cf1b..38d559f86587e 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -487,15 +487,15 @@ exit: ; preds = %vector.body
; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
; vector loop.
-define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
+define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
; APPLE-LABEL: define void @scalar_epilogue(
-; APPLE-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
+; APPLE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
-; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; APPLE: [[VECTOR_PH]]:
; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
-; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
@@ -513,32 +513,32 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; APPLE: [[MIDDLE_BLOCK]]:
; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
-; APPLE: [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
-; APPLE-NEXT: br label %[[FOR_BODY:.*]]
-; APPLE: [[FOR_BODY]]:
-; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ]
+; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
+; APPLE: [[SCALAR_REMAINDER_PREHEADER]]:
+; APPLE-NEXT: [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+; APPLE-NEXT: br label %[[SCALAR_REMAINDER:.*]]
+; APPLE: [[SCALAR_REMAINDER]]:
+; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; APPLE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[VAL]]
+; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[SPLAT_SCALAR]]
; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP5:![0-9]+]]
; APPLE: [[EXIT_LOOPEXIT]]:
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @scalar_epilogue(
-; CORTEXA55-SAME: i64 [[N:%.*]], ptr [[P:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
+; CORTEXA55-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
-; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CORTEXA55: [[VECTOR_PH]]:
; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
-; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
; CORTEXA55: [[VECTOR_BODY]]:
@@ -556,73 +556,73 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CORTEXA55: [[MIDDLE_BLOCK]]:
; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
-; CORTEXA55: [[FOR_BODY_PREHEADER]]:
+; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
+; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER]]:
; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_PROL_PREHEADER:.*]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]]
-; CORTEXA55: [[FOR_BODY_PROL_PREHEADER]]:
-; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL:.*]]
-; CORTEXA55: [[FOR_BODY_PROL]]:
+; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL_PREHEADER]]:
+; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL:.*]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
; CORTEXA55-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
-; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
; CORTEXA55-NEXT: [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
; CORTEXA55-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
-; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY_PROL_1:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]]
-; CORTEXA55: [[FOR_BODY_PROL_1]]:
+; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL_1]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
; CORTEXA55-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
-; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
; CORTEXA55-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
; CORTEXA55-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
-; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[FOR_BODY_PROL_2:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
-; CORTEXA55: [[FOR_BODY_PROL_2]]:
+; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL_2]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
; CORTEXA55-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
-; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
; CORTEXA55-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
-; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]
-; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]:
-; CORTEXA55-NEXT: [[I_06_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[FOR_BODY_PROL]] ], [ [[INC_PROL_1]], %[[FOR_BODY_PROL_1]] ], [ [[INC_PROL_2]], %[[FOR_BODY_PROL_2]] ]
-; CORTEXA55-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT]]
-; CORTEXA55: [[FOR_BODY_PROL_LOOPEXIT]]:
-; CORTEXA55-NEXT: [[I_06_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[I_06_UNR_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ]
+; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
+; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
+; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CORTEXA55-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
-; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
-; CORTEXA55: [[FOR_BODY_PREHEADER_NEW]]:
-; CORTEXA55-NEXT: br label %[[FOR_BODY:.*]]
-; CORTEXA55: [[FOR_BODY]]:
-; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[I_06_UNR]], %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
+; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER_NEW]]:
+; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER:.*]]
+; CORTEXA55: [[SCALAR_REMAINDER]]:
+; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[SCALAR_REMAINDER]] ]
; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; CORTEXA55-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; CORTEXA55-NEXT: [[INC:%.*]] = add nuw i64 [[I_06]], 1
; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
; CORTEXA55-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
; CORTEXA55-NEXT: [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
; CORTEXA55-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
; CORTEXA55-NEXT: [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
; CORTEXA55-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[VAL]]
+; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
-; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
; CORTEXA55: [[EXIT_LOOPEXIT]]:
@@ -631,42 +631,42 @@ define void @scalar_epilogue(i64 %N, ptr %p, i8 %val) {
; CORTEXA55-NEXT: ret void
;
entry:
- %min.iters.check = icmp ult i64 %N, 32
- br i1 %min.iters.check, label %for.body, label %vector.ph
+ %min.iters.check = icmp ult i64 %n, 32
+ br i1 %min.iters.check, label %scalar.remainder, label %vector.ph
vector.ph:
- %n.vec = and i64 %N, -32
- %broadcast.splatinsert = insertelement <16 x i8> poison, i8 %val, i64 0
+ %n.vec = and i64 %n, -32
+ %broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
%broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
br label %vector.body
vector.body:
- %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
- %0 = getelementptr inbounds nuw i8, ptr %p, i64 %index
- %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
- %wide.load = load <16 x i8>, ptr %0, align 1
- %wide.load8 = load <16 x i8>, ptr %1, align 1
- %2 = add <16 x i8> %wide.load, %broadcast.splat
- %3 = add <16 x i8> %wide.load8, %broadcast.splat
- store <16 x i8> %2, ptr %0, align 1
- store <16 x i8> %3, ptr %1, align 1
- %index.next = add nuw i64 %index, 32
- %4 = icmp eq i64 %index.next, %n.vec
- br i1 %4, label %middle.block, label %vector.body, !llvm.loop !2
+ %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
+ %gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
+ %gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
+ %wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
+ %wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
+ %add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
+ %add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
+ store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
+ store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
+ %iv.next = add nuw i64 %iv, 32
+ %exit.cond = icmp eq i64 %iv.next, %n.vec
+ br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2
middle.block:
- %cmp.n = icmp eq i64 %N, %n.vec
- br i1 %cmp.n, label %exit, label %for.body
+ %cmp.n = icmp eq i64 %n, %n.vec
+ br i1 %cmp.n, label %exit, label %scalar.remainder
-for.body:
- %i.06 = phi i64 [ %inc, %for.body ], [ %n.vec, %middle.block ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %i.06
- %8 = load i8, ptr %arrayidx, align 1
- %add = add i8 %8, %val
+scalar.remainder:
+ %iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
+ %scalar.load = load i8, ptr %arrayidx, align 1
+ %add = add i8 %scalar.load, %splat.scalar
store i8 %add, ptr %arrayidx, align 1
- %inc = add nuw i64 %i.06, 1
- %exitcond.not = icmp eq i64 %inc, %N
- br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !3
+ %inc = add nuw i64 %iv.scalar.loop, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3
exit:
ret void