[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)
Ahmad Yasin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 13:09:15 PDT 2025
https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/147420
From c5955fb6047fec8e7b192e0860540d409b10a93a Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ayasin at Ahmads-MacBook-Pro-2.local>
Date: Sun, 6 Jul 2025 22:59:47 +0300
Subject: [PATCH 1/4] Let vectorized loops be unrolled
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3f10da23b3494..c210eb73dbd55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4873,14 +4873,9 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.PartialOptSizeThreshold = 0;
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining. Don't unroll vector loops either, as they don't benefit much from
- // unrolling.
+ // inlining.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
- return;
-
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
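For context, the check removed above bailed out of getUnrollingPreferences as soon as any instruction in the loop produced a vector-typed value. A minimal sketch of such a loop (illustration only, not part of the patch; it is a simplified forward-copy variant of the @reverse test added later in this series, under a hypothetical name @copy_vec):

  define void @copy_vec(ptr %dst, ptr %src, i64 %n) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %gep.src = getelementptr inbounds <4 x float>, ptr %src, i64 %iv
    %v = load <4 x float>, ptr %gep.src, align 16   ; vector-typed result hit the old early return
    %gep.dst = getelementptr inbounds <4 x float>, ptr %dst, i64 %iv
    store <4 x float> %v, ptr %gep.dst, align 16
    %iv.next = add nuw nsw i64 %iv, 1
    %done = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %loop
  exit:
    ret void
  }

With the check gone, loops like this are rejected only for the reasons that also apply to scalar loops, such as calls that could block inlining.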
From b0d8301d4f8345ed106e19e83ba4026c62ac9e9f Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 6 Jul 2025 23:14:46 +0300
Subject: [PATCH 2/4] No need to unroll auto-vectorized loops that were
interleaved
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c210eb73dbd55..673fb691cd603 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4872,6 +4872,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+ // No need to unroll auto-vectorized loops that were interleaved
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
+ findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
for (auto *BB : L->getBlocks()) {
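For reference, the two findStringMetadataForLoop calls above look at the loop's !llvm.loop metadata. A sketch of the annotations the new early return matches (illustration only; the interleave count of 2 is arbitrary, and whether both strings are present depends on how the loop was vectorized, e.g. via a "#pragma clang loop interleave_count(2)" hint):

  ; loop latch branch carrying the loop metadata
  br i1 %exitcond, label %exit, label %vector.body, !llvm.loop !0

  !0 = distinct !{!0, !1, !2}
  !1 = !{!"llvm.loop.isvectorized", i32 1}
  !2 = !{!"llvm.loop.interleave.count", i32 2}

When both strings are present, the hook returns early and does not enable any additional unrolling for that loop.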
From 27c90711b5fa0b5840c4658913296b880f1d7bf1 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 01:48:10 +0300
Subject: [PATCH 3/4] Add a test for (runtime) unrolling a vector loop and a
 couple of debug prints
---
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 5 +-
.../Transforms/LoopUnroll/AArch64/vector.ll | 131 ++++++++++++++++++
2 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 0b9fee5727c6f..354633f837d45 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,7 +1172,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
+ << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
+ << " Loc=" << L->getLocStr() << "\n");
TransformationMode TM = hasUnrollTransformation(L);
if (TM & TM_Disable)
return LoopUnrollResult::Unmodified;
@@ -1219,6 +1220,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
+ LLVM_DEBUG(dbgs() << " UP.Partial=" << UP.Partial
+ << " UP.Runtime=" << UP.Runtime << "\n");
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
new file mode 100644
index 0000000000000..dbde0df575472
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT: [[ENTRY:.*:]]
+; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
+; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: br label %[[FOR_BODY:.*]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; APPLE: [[FOR_BODY_EPIL]]:
+; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
+; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: ret void
+; APPLE: [[FOR_BODY]]:
+; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
+; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
+; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
+; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
+; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
+; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
+; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
+; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
+; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+ %shr = ashr i32 %len, 2
+ %cmp7 = icmp sgt i32 %shr, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext nneg i32 %shr to i64
+ %wide.trip.count = zext nneg i32 %shr to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = sub nsw i64 %0, %indvars.iv
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+ %2 = load <4 x float>, ptr %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+ store <4 x float> %2, ptr %arrayidx2, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
+
From 3e73e55043f4c1a7b563fb2d3e1a80b0cd44c0a8 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 23:08:58 +0300
Subject: [PATCH 4/4] Simplify the vector.ll test and add a check for -mtriple
 aarch64
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 69 ++++++++++++-------
1 file changed, 43 insertions(+), 26 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index dbde0df575472..cbbe5d63b4cee 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,21 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
-; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*:]]
-; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
-; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; APPLE: [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
-; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
-; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
@@ -29,13 +27,13 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -48,49 +46,49 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
@@ -100,14 +98,33 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
;
+; GENERIC-LABEL: define void @reverse(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
+; GENERIC-NEXT: [[ENTRY:.*:]]
+; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
+; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; GENERIC: [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
+; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
+; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT: ret void
+; GENERIC: [[FOR_BODY]]:
+; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
entry:
- %shr = ashr i32 %len, 2
- %cmp7 = icmp sgt i32 %shr, 0
+ %cmp7 = icmp sgt i64 %len, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
- %0 = zext nneg i32 %shr to i64
- %wide.trip.count = zext nneg i32 %shr to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
@@ -115,17 +132,17 @@ for.cond.cleanup: ; preds = %for.body, %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %1 = sub nsw i64 %0, %indvars.iv
+ %1 = sub nsw i64 %len, %indvars.iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
store <4 x float> %2, ptr %arrayidx2, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %len
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
;.
-