[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)
Ahmad Yasin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 13:09:15 PDT 2025
https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/147420
From c5955fb6047fec8e7b192e0860540d409b10a93a Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ayasin at Ahmads-MacBook-Pro-2.local>
Date: Sun, 6 Jul 2025 22:59:47 +0300
Subject: [PATCH 1/4] Let vectorized loops be unrolled
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3f10da23b3494..c210eb73dbd55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4873,14 +4873,9 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.PartialOptSizeThreshold = 0;
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining. Don't unroll vector loops either, as they don't benefit much from
- // unrolling.
+ // inlining.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
- return;
-
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
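For context, the check removed above bailed out of getUnrollingPreferences as soon as any instruction in the loop produced a vector-typed value. A minimal sketch of such a loop (illustration only, not part of the patch; it is a simplified forward-copy variant of the @reverse test added later in this series, under a hypothetical name @copy_vec):

  define void @copy_vec(ptr %dst, ptr %src, i64 %n) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %gep.src = getelementptr inbounds <4 x float>, ptr %src, i64 %iv
    %v = load <4 x float>, ptr %gep.src, align 16   ; vector-typed result hit the old early return
    %gep.dst = getelementptr inbounds <4 x float>, ptr %dst, i64 %iv
    store <4 x float> %v, ptr %gep.dst, align 16
    %iv.next = add nuw nsw i64 %iv, 1
    %done = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %loop
  exit:
    ret void
  }

With the check gone, loops like this are rejected only for the reasons that also apply to scalar loops, such as calls that could block inlining.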
From b0d8301d4f8345ed106e19e83ba4026c62ac9e9f Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 6 Jul 2025 23:14:46 +0300
Subject: [PATCH 2/4] No need to unroll auto-vectorized loops that were
interleaved
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c210eb73dbd55..673fb691cd603 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4872,6 +4872,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+ // No need to unroll auto-vectorized loops that were interleaved
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
+ findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
for (auto *BB : L->getBlocks()) {
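For reference, the two findStringMetadataForLoop calls above look at the loop's !llvm.loop metadata. A sketch of the annotations the new early return matches (illustration only; the interleave count of 2 is arbitrary, and whether both strings are present depends on how the loop was vectorized, e.g. via a "#pragma clang loop interleave_count(2)" hint):

  ; loop latch branch carrying the loop metadata
  br i1 %exitcond, label %exit, label %vector.body, !llvm.loop !0

  !0 = distinct !{!0, !1, !2}
  !1 = !{!"llvm.loop.isvectorized", i32 1}
  !2 = !{!"llvm.loop.interleave.count", i32 2}

When both strings are present, the hook returns early and does not enable any additional unrolling for that loop.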
From 27c90711b5fa0b5840c4658913296b880f1d7bf1 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 01:48:10 +0300
Subject: [PATCH 3/4] Add a test for (runtime) unrolling a vector loop and a
 couple of debug prints
---
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 5 +-
.../Transforms/LoopUnroll/AArch64/vector.ll | 131 ++++++++++++++++++
2 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 0b9fee5727c6f..354633f837d45 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,7 +1172,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
+ << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
+ << " Loc=" << L->getLocStr() << "\n");
TransformationMode TM = hasUnrollTransformation(L);
if (TM & TM_Disable)
return LoopUnrollResult::Unmodified;
@@ -1219,6 +1220,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
+ LLVM_DEBUG(dbgs() << " UP.Partial=" << UP.Partial
+ << " UP.Runtime=" << UP.Runtime << "\n");
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
new file mode 100644
index 0000000000000..dbde0df575472
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT: [[ENTRY:.*:]]
+; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
+; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: br label %[[FOR_BODY:.*]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; APPLE: [[FOR_BODY_EPIL]]:
+; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
+; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: ret void
+; APPLE: [[FOR_BODY]]:
+; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
+; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
+; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
+; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
+; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
+; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
+; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
+; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
+; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+ %shr = ashr i32 %len, 2
+ %cmp7 = icmp sgt i32 %shr, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext nneg i32 %shr to i64
+ %wide.trip.count = zext nneg i32 %shr to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = sub nsw i64 %0, %indvars.iv
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+ %2 = load <4 x float>, ptr %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+ store <4 x float> %2, ptr %arrayidx2, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
+
From 3e73e55043f4c1a7b563fb2d3e1a80b0cd44c0a8 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 23:08:58 +0300
Subject: [PATCH 4/4] Simplify the vector.ll test and add a check for -mtriple
 aarch64
---
.../Transforms/LoopUnroll/AArch64/vector.ll | 69 ++++++++++++-------
1 file changed, 43 insertions(+), 26 deletions(-)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index dbde0df575472..cbbe5d63b4cee 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,21 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
-; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*:]]
-; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
-; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; APPLE: [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
-; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
-; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
@@ -29,13 +27,13 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -48,49 +46,49 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
@@ -100,14 +98,33 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
;
+; GENERIC-LABEL: define void @reverse(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
+; GENERIC-NEXT: [[ENTRY:.*:]]
+; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
+; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; GENERIC: [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
+; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
+; GENERIC: [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT: ret void
+; GENERIC: [[FOR_BODY]]:
+; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
entry:
- %shr = ashr i32 %len, 2
- %cmp7 = icmp sgt i32 %shr, 0
+ %cmp7 = icmp sgt i64 %len, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
- %0 = zext nneg i32 %shr to i64
- %wide.trip.count = zext nneg i32 %shr to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
@@ -115,17 +132,17 @@ for.cond.cleanup: ; preds = %for.body, %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %1 = sub nsw i64 %0, %indvars.iv
+ %1 = sub nsw i64 %len, %indvars.iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
store <4 x float> %2, ptr %arrayidx2, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %len
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
;.
-