[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 14 12:52:30 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/147420
>From db867b90f6b897b4c50fc44a8997a7643e963902 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ayasin at Ahmads-MacBook-Pro-2.local>
Date: Sun, 6 Jul 2025 22:59:47 +0300
Subject: [PATCH 01/13] Let vectorized loops be unrolled
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c04cbc80bc5b6..f97886c486f8b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4895,14 +4895,9 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.PartialOptSizeThreshold = 0;
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining. Don't unroll vector loops either, as they don't benefit much from
- // unrolling.
+ // inlining.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
- return;
-
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
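For context, the guard deleted above bailed out of getUnrollingPreferences as soon
as any instruction in the loop produced a vector value, so a hand-vectorized body
such as the following sketch (illustrative IR in the style of the test added later
in this series, not part of the patch) was never considered for unrolling:

    %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %iv
    %v = load <4 x float>, ptr %arrayidx, align 16   ; I.getType()->isVectorTy() is true for %v
    store <4 x float> %v, ptr %dst, align 16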
>From 744cea53cbf82128c24d3bcdd77a508eb5a20367 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 6 Jul 2025 23:14:46 +0300
Subject: [PATCH 02/13] No need to unroll auto-vectorized loops that were
interleaved
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f97886c486f8b..7616aac4028b3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,6 +4894,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+ // No need to unroll auto-vectorized loops that were interleaved
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
+ findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
for (auto *BB : L->getBlocks()) {
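For reference, the two strings queried above name standard loop annotations:
"llvm.loop.isvectorized" is attached by the loop vectorizer to loops it has already
transformed, and "llvm.loop.interleave.count" carries an interleave-count hint (for
example from a pragma). A minimal sketch of a latch carrying both, with arbitrary
metadata numbering and purely for illustration:

    br i1 %done, label %exit, label %vector.body, !llvm.loop !0
    !0 = distinct !{!0, !1, !2}
    !1 = !{!"llvm.loop.isvectorized", i32 1}
    !2 = !{!"llvm.loop.interleave.count", i32 2}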
>From 23e689fd93653d0331c25e67c1904c58a2aa5927 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 01:48:10 +0300
Subject: [PATCH 03/13] Adding a test for (runtime) unrolling of a vector loop
 and a couple of debug prints
---
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 5 +-
.../Transforms/LoopUnroll/AArch64/vector.ll | 131 ++++++++++++++++++
2 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index a22d84dcf014d..7a7c945e1cb24 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,7 +1172,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
+ << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
+ << " Loc=" << L->getLocStr() << "\n");
TransformationMode TM = hasUnrollTransformation(L);
if (TM & TM_Disable)
return LoopUnrollResult::Unmodified;
@@ -1219,6 +1220,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
+ LLVM_DEBUG(dbgs() << " UP.Partial=" << UP.Partial
+ << " UP.Runtime=" << UP.Runtime << "\n");
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
new file mode 100644
index 0000000000000..dbde0df575472
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT: [[ENTRY:.*:]]
+; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
+; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: br label %[[FOR_BODY:.*]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; APPLE: [[FOR_BODY_EPIL]]:
+; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
+; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: ret void
+; APPLE: [[FOR_BODY]]:
+; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
+; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
+; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
+; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
+; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
+; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
+; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
+; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
+; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+ %shr = ashr i32 %len, 2
+ %cmp7 = icmp sgt i32 %shr, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext nneg i32 %shr to i64
+ %wide.trip.count = zext nneg i32 %shr to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = sub nsw i64 %0, %indvars.iv
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+ %2 = load <4 x float>, ptr %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+ store <4 x float> %2, ptr %arrayidx2, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
+
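As an aside, the [[LOOP0]]/[[META1]] checks at the end of the new test capture the
metadata the runtime unroller attaches to the epilogue (remainder) loop so that it
is not unrolled again; written out as plain IR it is roughly:

    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.unroll.disable"}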
>From 5665ac093ecba02d3c82e681e4b06f18490040a9 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 23:08:58 +0300
Subject: [PATCH 04/13] Simplified the vector.ll test + check for -mtriple
aarch64
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 69 ++++++++++++-------
1 file changed, 43 insertions(+), 26 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index dbde0df575472..cbbe5d63b4cee 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,21 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
-; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*:]]
-; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
-; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; APPLE: [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
-; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
-; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
@@ -29,13 +27,13 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -48,49 +46,49 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
@@ -100,14 +98,33 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
;
+; GENERIC-LABEL: define void @reverse(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
+; GENERIC-NEXT: [[ENTRY:.*:]]
+; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
+; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; GENERIC: [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
+; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
+; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT: ret void
+; GENERIC: [[FOR_BODY]]:
+; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
entry:
- %shr = ashr i32 %len, 2
- %cmp7 = icmp sgt i32 %shr, 0
+ %cmp7 = icmp sgt i64 %len, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
- %0 = zext nneg i32 %shr to i64
- %wide.trip.count = zext nneg i32 %shr to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
@@ -115,17 +132,17 @@ for.cond.cleanup: ; preds = %for.body, %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %1 = sub nsw i64 %0, %indvars.iv
+ %1 = sub nsw i64 %len, %indvars.iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
store <4 x float> %2, ptr %arrayidx2, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %len
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
;.
-
>From 04459401eed9549db4eb6dcfe470a1030cb17d17 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 14:09:15 +0300
Subject: [PATCH 05/13] Revert a couple of debug prints
---
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 7a7c945e1cb24..a22d84dcf014d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,8 +1172,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
- << " Loc=" << L->getLocStr() << "\n");
+ << L->getHeader()->getName() << "\n");
TransformationMode TM = hasUnrollTransformation(L);
if (TM & TM_Disable)
return LoopUnrollResult::Unmodified;
@@ -1220,8 +1219,6 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
- LLVM_DEBUG(dbgs() << " UP.Partial=" << UP.Partial
- << " UP.Runtime=" << UP.Runtime << "\n");
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
>From 7ff7ce3ac52b2e700e54ef433e1cd9d0ffffac4d Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 18:03:15 +0300
Subject: [PATCH 06/13] Revert the non-interleaved auto-vectorized case +
rename %indvars.iv
---
.../AArch64/AArch64TargetTransformInfo.cpp | 5 ++---
.../test/Transforms/LoopUnroll/AArch64/vector.ll | 16 ++++++++--------
2 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7616aac4028b3..230ad821a8a1e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,9 +4894,8 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- // No need to unroll auto-vectorized loops that were interleaved
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
- findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+ // No need to unroll auto-vectorized loops
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
return;
// Scan the loop: don't unroll loops with calls as this could prevent
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index cbbe5d63b4cee..e6f195bc7147a 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -16,16 +16,16 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
-; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
-; APPLE-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; APPLE: [[FOR_BODY_EPIL]]:
-; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
@@ -131,14 +131,14 @@ for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %1 = sub nsw i64 %len, %indvars.iv
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+ %1 = sub nsw i64 %len, %iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
store <4 x float> %2, ptr %arrayidx2, align 16
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %len
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %len
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
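Note the distinction this preserves: the hand-written reverse loop above carries no
!llvm.loop annotation and is still runtime-unrolled, while an auto-vectorized loop
is recognizable from metadata like the following and is now left alone (illustrative
IR, arbitrary numbering):

    br i1 %done, label %exit, label %vector.body, !llvm.loop !0
    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.isvectorized", i32 1}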
>From 1553324d5d763969341a9315fb59366a17ba7b93 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 21:58:20 +0300
Subject: [PATCH 07/13] Add a 2nd test for an auto-vectorized loop with a static
 trip count
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 164 +++++++++++++++---
1 file changed, 137 insertions(+), 27 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index e6f195bc7147a..c4ee8c2f08204 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -4,24 +4,21 @@
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
-; APPLE-NEXT: [[ENTRY:.*:]]
-; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
-; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[FOR_BODY_PREHEADER:.*]]:
; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
-; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
-; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
-; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
+; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA]]:
+; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; APPLE: [[FOR_BODY_EPIL]]:
@@ -36,10 +33,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
-; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
-; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
; APPLE: [[FOR_COND_CLEANUP]]:
; APPLE-NEXT: ret void
@@ -96,18 +91,13 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
-; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
;
; GENERIC-LABEL: define void @reverse(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT: [[ENTRY:.*:]]
-; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
-; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; GENERIC: [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT: [[FOR_BODY_PREHEADER:.*]]:
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
-; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
-; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
-; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC: [[FOR_COND_CLEANUP:.*]]:
; GENERIC-NEXT: ret void
; GENERIC: [[FOR_BODY]]:
; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -118,12 +108,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
;
-entry:
- %cmp7 = icmp sgt i64 %len, 0
- br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
for.body.preheader: ; preds = %entry
br label %for.body
@@ -142,7 +128,131 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
+define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; APPLE: [[VECTOR_BODY]]:
+; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
+; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
+; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
+; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
+; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
+; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
+; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
+; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
+; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
+; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
+; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
+; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: ret void
+;
+; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
+; GENERIC: [[VECTOR_BODY]]:
+; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
+; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
+; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
+; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
+; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
+; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
+; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
+; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
+; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
+; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
+; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
+; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT: ret void
+;
+entry:
+ %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+ %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+ %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %2 = getelementptr inbounds nuw i8, ptr %0, i64 32
+ %3 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %wide.load = load <4 x float>, ptr %0, align 4
+ %wide.load12 = load <4 x float>, ptr %1, align 4
+ %wide.load13 = load <4 x float>, ptr %2, align 4
+ %wide.load14 = load <4 x float>, ptr %3, align 4
+ %4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+ %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
+ %6 = getelementptr inbounds nuw i8, ptr %4, i64 32
+ %7 = getelementptr inbounds nuw i8, ptr %4, i64 48
+ %wide.load15 = load <4 x float>, ptr %4, align 4
+ %wide.load16 = load <4 x float>, ptr %5, align 4
+ %wide.load17 = load <4 x float>, ptr %6, align 4
+ %wide.load18 = load <4 x float>, ptr %7, align 4
+ %8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
+ %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
+ %10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
+ %11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
+ store <4 x float> %8, ptr %4, align 4
+ store <4 x float> %9, ptr %5, align 4
+ store <4 x float> %10, ptr %6, align 4
+ store <4 x float> %11, ptr %7, align 4
+ %index.next = add nuw i64 %index, 16
+ %12 = icmp eq i64 %index.next, 1024
+ br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
+!22 = !{!"llvm.loop.isvectorized", i32 1}
+
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
;.
>From a3c2ac64e5846d1ebaefb2d763abf04049c7e51e Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Fri, 11 Jul 2025 00:10:43 +0300
Subject: [PATCH 08/13] Shortened the test & cleaned up entry/exit labels
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 185 ++++++------------
1 file changed, 64 insertions(+), 121 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index c4ee8c2f08204..8a3e17588610e 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -4,43 +4,17 @@
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
-; APPLE-NEXT: [[FOR_BODY_PREHEADER:.*]]:
+; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
-; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT: br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
-; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
-; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
-; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA]]:
-; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
-; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
-; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
-; APPLE: [[FOR_BODY_EPIL]]:
-; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
-; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
-; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
-; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
-; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
-; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
-; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
-; APPLE: [[FOR_COND_CLEANUP]]:
-; APPLE-NEXT: ret void
; APPLE: [[FOR_BODY]]:
-; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
-; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -91,16 +65,40 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
-; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; APPLE: [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; APPLE: [[FOR_BODY_EPIL]]:
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
+; APPLE-NEXT: [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
+; APPLE-NEXT: store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[EXIT]]
+; APPLE: [[EXIT]]:
+; APPLE-NEXT: ret void
;
; GENERIC-LABEL: define void @reverse(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT: [[FOR_BODY_PREHEADER:.*]]:
+; GENERIC-NEXT: [[ENTRY:.*]]:
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
-; GENERIC: [[FOR_COND_CLEANUP:.*]]:
-; GENERIC-NEXT: ret void
; GENERIC: [[FOR_BODY]]:
-; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -108,16 +106,15 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; GENERIC: [[EXIT]]:
+; GENERIC-NEXT: ret void
;
-for.body.preheader: ; preds = %entry
+entry: ; preds = %entry
br label %for.body
-for.cond.cleanup: ; preds = %for.body, %entry
- ret void
-
-for.body: ; preds = %for.body.preheader, %for.body
- %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%1 = sub nsw i64 %len, %iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
@@ -125,13 +122,13 @@ for.body: ; preds = %for.body.preheader,
store <4 x float> %2, ptr %arrayidx2, align 16
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %len
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
}
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
-; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -142,33 +139,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
-; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
-; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
-; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
-; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
-; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
-; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
-; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
-; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
-; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
-; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
-; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
-; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
-; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
-; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
@@ -180,33 +159,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; GENERIC: [[VECTOR_BODY]]:
; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
-; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
-; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
-; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
-; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
-; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
-; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
-; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
-; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
-; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
-; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
-; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
-; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
;
entry:
@@ -217,37 +178,19 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
- %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
- %2 = getelementptr inbounds nuw i8, ptr %0, i64 32
- %3 = getelementptr inbounds nuw i8, ptr %0, i64 48
%wide.load = load <4 x float>, ptr %0, align 4
+ %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
- %wide.load13 = load <4 x float>, ptr %2, align 4
- %wide.load14 = load <4 x float>, ptr %3, align 4
- %4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
- %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
- %6 = getelementptr inbounds nuw i8, ptr %4, i64 32
- %7 = getelementptr inbounds nuw i8, ptr %4, i64 48
- %wide.load15 = load <4 x float>, ptr %4, align 4
- %wide.load16 = load <4 x float>, ptr %5, align 4
- %wide.load17 = load <4 x float>, ptr %6, align 4
- %wide.load18 = load <4 x float>, ptr %7, align 4
- %8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
- %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
- %10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
- %11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
- store <4 x float> %8, ptr %4, align 4
- store <4 x float> %9, ptr %5, align 4
- store <4 x float> %10, ptr %6, align 4
- store <4 x float> %11, ptr %7, align 4
- %index.next = add nuw i64 %index, 16
- %12 = icmp eq i64 %index.next, 1024
- br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
+ %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+ store <4 x float> %2, ptr %1, align 4
+ %index.next = add nuw i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %exit, label %vector.body, !llvm.loop !0
-for.cond.cleanup: ; preds = %vector.body
+exit: ; preds = %vector.body
ret void
}
-!22 = !{!"llvm.loop.isvectorized", i32 1}
+!0 = !{!"llvm.loop.isvectorized", i32 1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
>From 0f3dad93f51887bf2a2145c26ade6dd4376db1a0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 13 Jul 2025 22:40:00 +0300
Subject: [PATCH 09/13] updated the vector.ll test and added the COMMON prefix
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 158 +++++++++---------
1 file changed, 80 insertions(+), 78 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 8a3e17588610e..36eee7549ec3a 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,73 +1,73 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*]]:
-; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
-; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT: br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_BODY]]:
-; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
-; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
-; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
-; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
-; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
-; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
-; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
-; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
-; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
-; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
-; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
-; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
-; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
-; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
-; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
-; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
-; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
-; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
-; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
-; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
-; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
-; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
-; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
-; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
-; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
-; APPLE-NEXT: store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
-; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT: [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; APPLE-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; APPLE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; APPLE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; APPLE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; APPLE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; APPLE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; APPLE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; APPLE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
+; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; APPLE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; APPLE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
+; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; APPLE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; APPLE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
+; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; APPLE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; APPLE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
+; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
+; APPLE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
-; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
; APPLE: [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
@@ -78,11 +78,11 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
-; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
-; APPLE-NEXT: [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
+; APPLE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
-; APPLE-NEXT: store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
@@ -98,14 +98,14 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; GENERIC-NEXT: [[ENTRY:.*]]:
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
; GENERIC: [[FOR_BODY]]:
-; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
-; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
@@ -139,14 +139,14 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; APPLE-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
@@ -159,14 +159,14 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; GENERIC: [[VECTOR_BODY]]:
; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; GENERIC-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; GENERIC-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
;
@@ -199,3 +199,5 @@ exit: ; preds = %vector.body
;.
; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
>From d33001d7e1ae01cbc17aa5fc8314cd6590a84539 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 12:31:00 +0300
Subject: [PATCH 10/13] added test for full unroll; fixed !llvm.loop metadata;
 reverted the (non-working) COMMON prefix
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 76 +++++++++++++++++--
1 file changed, 69 insertions(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 36eee7549ec3a..bff8433b75b1b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -129,6 +129,67 @@ exit: ; preds = %for.body, %entry
}
+define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT: [[ENTRY:.*:]]
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; APPLE: [[VECTOR_BODY]]:
+; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; APPLE-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; APPLE-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; APPLE-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; APPLE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; APPLE-NEXT: ret void
+;
+; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
+; GENERIC-NEXT: [[ENTRY:.*:]]
+; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
+; GENERIC: [[VECTOR_BODY]]:
+; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; GENERIC-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; GENERIC-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; GENERIC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; GENERIC-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; GENERIC-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; GENERIC-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; GENERIC-NEXT: ret void
+;
+entry:
+ %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+ %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+ %wide.load = load <4 x float>, ptr %0, align 4
+ %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+ %wide.load12 = load <4 x float>, ptr %1, align 4
+ %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+ store <4 x float> %2, ptr %1, align 4
+ %index.next = add nuw i64 %index, 4
+ %3 = icmp eq i64 %index.next, 8
+ br i1 %3, label %exit, label %vector.body
+
+exit: ; preds = %vector.body
+ ret void
+}
+
+
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -190,14 +251,15 @@ vector.body: ; preds = %vector.body, %entry
exit: ; preds = %vector.body
ret void
}
-!0 = !{!"llvm.loop.isvectorized", i32 1}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
-; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
+; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
;.
-; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
+; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; COMMON: {{.*}}
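(For reference, the self-referential !llvm.loop form that the patch above switches to looks like this in a standalone module; the function and label names below are placeholders for illustration, not taken from the patch:

  define void @loop_md_sketch(i64 %n) {
  entry:
    br label %loop

  loop:                                       ; latch branch carries !llvm.loop !0
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %iv.next = add nuw i64 %iv, 1
    %done = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %loop, !llvm.loop !0

  exit:
    ret void
  }

  !0 = distinct !{!0, !1}                     ; self-referential loop ID node
  !1 = !{!"llvm.loop.isvectorized", i32 1}    ; vectorization hint hanging off the loop ID

The first operand of the loop ID must reference the node itself, with the string entries attached as further operands; a bare !{!"llvm.loop.isvectorized", i32 1} node attached directly to the branch, as in the earlier revision of the test, is not recognized as a loop ID by LoopInfo.)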
>From eed737c62200b8aa3f8248b23d7b1e069916f9bb Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 16:30:09 +0300
Subject: [PATCH 11/13] another saxpy test flavor that Cortex-A55 can unroll +
 replaced prefix=GENERIC with CORTEXA55
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 297 +++++++++++++++++-
1 file changed, 291 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index bff8433b75b1b..45eb9c50b34eb 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -93,6 +93,85 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
+; CORTEXA55-LABEL: define void @reverse(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CORTEXA55-NEXT: [[ENTRY:.*]]:
+; CORTEXA55-NEXT: [[TMP0:%.*]] = add i64 [[LEN]], -1
+; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 3
+; CORTEXA55-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
+; CORTEXA55-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CORTEXA55: [[ENTRY_NEW]]:
+; CORTEXA55-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; CORTEXA55-NEXT: br label %[[FOR_BODY:.*]]
+; CORTEXA55: [[FOR_BODY]]:
+; CORTEXA55-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT: [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; CORTEXA55-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CORTEXA55-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; CORTEXA55-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CORTEXA55-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; CORTEXA55-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CORTEXA55-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; CORTEXA55-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CORTEXA55-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4
+; CORTEXA55-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
+; CORTEXA55-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CORTEXA55: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CORTEXA55-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; CORTEXA55: [[EXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CORTEXA55: [[FOR_BODY_EPIL_PREHEADER]]:
+; CORTEXA55-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; CORTEXA55: [[FOR_BODY_EPIL]]:
+; CORTEXA55-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
+; CORTEXA55-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; CORTEXA55-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
+; CORTEXA55-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
+; CORTEXA55-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
+; CORTEXA55: [[FOR_BODY_EPIL_1]]:
+; CORTEXA55-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; CORTEXA55-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
+; CORTEXA55-NEXT: [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
+; CORTEXA55-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
+; CORTEXA55-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55: [[FOR_BODY_EPIL_2]]:
+; CORTEXA55-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; CORTEXA55-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
+; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
+; CORTEXA55-NEXT: br label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55: [[EXIT_EPILOG_LCSSA]]:
+; CORTEXA55-NEXT: br label %[[EXIT]]
+; CORTEXA55: [[EXIT]]:
+; CORTEXA55-NEXT: ret void
+;
; GENERIC-LABEL: define void @reverse(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
; GENERIC-NEXT: [[ENTRY:.*]]:
@@ -109,7 +188,6 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
-;
entry: ; preds = %entry
br label %for.body
@@ -149,6 +227,25 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; APPLE-NEXT: ret void
;
+; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT: [[ENTRY:.*:]]
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
+; CORTEXA55: [[VECTOR_BODY]]:
+; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; CORTEXA55-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; CORTEXA55-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; CORTEXA55-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CORTEXA55-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CORTEXA55-NEXT: ret void
+;
; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
; GENERIC-NEXT: [[ENTRY:.*:]]
@@ -167,7 +264,6 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; GENERIC-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; GENERIC-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; GENERIC-NEXT: ret void
-;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -190,6 +286,174 @@ exit: ; preds = %vector.body
}
+define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; APPLE: [[VECTOR_BODY]]:
+; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; APPLE: [[EXIT]]:
+; APPLE-NEXT: ret void
+;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT: [[ENTRY:.*]]:
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
+; CORTEXA55: [[VECTOR_BODY]]:
+; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
+; CORTEXA55-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CORTEXA55-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CORTEXA55-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
+; CORTEXA55-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CORTEXA55-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CORTEXA55-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
+; CORTEXA55-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CORTEXA55-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
+; CORTEXA55-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
+; CORTEXA55-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CORTEXA55-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CORTEXA55-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
+; CORTEXA55-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
+; CORTEXA55-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
+; CORTEXA55-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
+; CORTEXA55-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
+; CORTEXA55-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
+; CORTEXA55-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
+; CORTEXA55-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
+; CORTEXA55-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
+; CORTEXA55-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
+; CORTEXA55-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; CORTEXA55-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
+; CORTEXA55-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
+; CORTEXA55-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
+; CORTEXA55-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
+; CORTEXA55-NEXT: [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
+; CORTEXA55-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
+; CORTEXA55-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
+; CORTEXA55-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
+; CORTEXA55-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
+; CORTEXA55-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
+; CORTEXA55-NEXT: [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
+; CORTEXA55-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
+; CORTEXA55-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
+; CORTEXA55-NEXT: [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
+; CORTEXA55-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
+; CORTEXA55-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
+; CORTEXA55-NEXT: [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
+; CORTEXA55-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
+; CORTEXA55-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
+; CORTEXA55-NEXT: [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
+; CORTEXA55-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
+; CORTEXA55-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
+; CORTEXA55-NEXT: [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
+; CORTEXA55-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
+; CORTEXA55-NEXT: br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; CORTEXA55: [[EXIT]]:
+; CORTEXA55-NEXT: ret void
+;
+entry:
+ %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+ %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+ %wide.load = load <4 x float>, ptr %0, align 4
+ %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+ %wide.load12 = load <4 x float>, ptr %1, align 4
+ %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+ store <4 x float> %2, ptr %1, align 4
+ %index.next = add nuw i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %exit, label %vector.body
+
+exit: ; preds = %vector.body
+ ret void
+}
+
+
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -211,6 +475,26 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT: [[ENTRY:.*]]:
+; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
+; CORTEXA55: [[VECTOR_BODY]]:
+; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CORTEXA55-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CORTEXA55-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CORTEXA55: [[EXIT]]:
+; CORTEXA55-NEXT: ret void
+;
; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
; GENERIC-NEXT: [[ENTRY:.*]]:
@@ -230,7 +514,6 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; GENERIC-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
-;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -254,12 +537,14 @@ exit: ; preds = %vector.body
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
+; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
;.
-; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
;.
>From 06b3012754b9616eea382c20e5cf934363aa06ea Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 17:38:07 +0300
Subject: [PATCH 12/13] cleaned up unused GENERIC check lines
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 55 -------------------
1 file changed, 55 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 45eb9c50b34eb..c3a713142a63b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -172,22 +172,6 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
-; GENERIC-LABEL: define void @reverse(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT: [[ENTRY:.*]]:
-; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
-; GENERIC: [[FOR_BODY]]:
-; GENERIC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
-; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
-; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
-; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
-; GENERIC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
-; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
-; GENERIC: [[EXIT]]:
-; GENERIC-NEXT: ret void
entry: ; preds = %entry
br label %for.body
@@ -246,24 +230,6 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; CORTEXA55-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; CORTEXA55-NEXT: ret void
;
-; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
-; GENERIC-NEXT: [[ENTRY:.*:]]
-; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
-; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
-; GENERIC: [[VECTOR_BODY]]:
-; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
-; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
-; GENERIC-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-; GENERIC-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
-; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
-; GENERIC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
-; GENERIC-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; GENERIC-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
-; GENERIC-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
-; GENERIC-NEXT: ret void
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -495,25 +461,6 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
-; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
-; GENERIC-NEXT: [[ENTRY:.*]]:
-; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
-; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
-; GENERIC: [[VECTOR_BODY]]:
-; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-; GENERIC-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
-; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; GENERIC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; GENERIC: [[EXIT]]:
-; GENERIC-NEXT: ret void
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -537,8 +484,6 @@ exit: ; preds = %vector.body
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
-; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
>From a48c02a3bba8cd1dc9be59732f5b006c0baba80b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 14 Jul 2025 20:52:15 +0100
Subject: [PATCH 13/13] Update vector.ll
---
llvm/test/Transforms/LoopUnroll/AArch64/vector.ll | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index c3a713142a63b..8baded897fd7d 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
+
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {