[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 14 12:52:30 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/147420

>From db867b90f6b897b4c50fc44a8997a7643e963902 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ayasin at Ahmads-MacBook-Pro-2.local>
Date: Sun, 6 Jul 2025 22:59:47 +0300
Subject: [PATCH 01/13] Let vectorized loops be unrolled

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c04cbc80bc5b6..f97886c486f8b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4895,14 +4895,9 @@ void AArch64TTIImpl::getUnrollingPreferences(
   UP.PartialOptSizeThreshold = 0;
 
   // Scan the loop: don't unroll loops with calls as this could prevent
-  // inlining. Don't unroll vector loops either, as they don't benefit much from
-  // unrolling.
+  // inlining.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
-        return;
-
       if (isa<CallBase>(I)) {
         if (isa<CallInst>(I) || isa<InvokeInst>(I))
           if (const Function *F = cast<CallBase>(I).getCalledFunction())

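[Editor's note, not part of the patch series; a minimal sketch for illustration.] With the vector-type bail-out removed, a loop whose body already operates on vector values stays a candidate for the partial/runtime unrolling decisions made later in getUnrollingPreferences. A hypothetical IR loop of that shape (names invented for illustration):

; Sketch: before PATCH 01 the <4 x float> load/store below would have made
; getUnrollingPreferences return early; with the check removed the loop is
; still considered for partial/runtime unrolling.
define void @copy_vec(ptr %dst, ptr %src, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr inbounds <4 x float>, ptr %src, i64 %iv
  %v = load <4 x float>, ptr %src.gep, align 16
  %dst.gep = getelementptr inbounds <4 x float>, ptr %dst, i64 %iv
  store <4 x float> %v, ptr %dst.gep, align 16
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
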
>From 744cea53cbf82128c24d3bcdd77a508eb5a20367 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 6 Jul 2025 23:14:46 +0300
Subject: [PATCH 02/13] No need to unroll auto-vectorized loops that were
 interleaved

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f97886c486f8b..7616aac4028b3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,6 +4894,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
+  // No need to unroll auto-vectorized loops that were interleaved
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
+      findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+    return;
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining.
   for (auto *BB : L->getBlocks()) {

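[Editor's note, not part of the patch series.] findStringMetadataForLoop searches the loop's !llvm.loop metadata for an entry whose first operand is the given string. A hedged sketch of the metadata shape the two calls above match together, e.g. on the latch branch of a loop that was vectorized with an interleave count recorded (node numbers arbitrary):

; Sketch: latch branch carrying the loop metadata the check looks for.
  br i1 %cond, label %exit, label %vector.body, !llvm.loop !0

!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = !{!"llvm.loop.interleave.count", i32 2}
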
>From 23e689fd93653d0331c25e67c1904c58a2aa5927 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 01:48:10 +0300
Subject: [PATCH 03/13] Adding a test for (runtime) unrolling of a vector loop
 and a couple of debug prints


---
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   5 +-
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 131 ++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/vector.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index a22d84dcf014d..7a7c945e1cb24 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,7 +1172,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
-                    << L->getHeader()->getName() << "\n");
+                    << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
+                    << " Loc=" << L->getLocStr() << "\n");
   TransformationMode TM = hasUnrollTransformation(L);
   if (TM & TM_Disable)
     return LoopUnrollResult::Unmodified;
@@ -1219,6 +1220,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
       ProvidedFullUnrollMaxCount);
   TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
       L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
+  LLVM_DEBUG(dbgs() << "  UP.Partial=" << UP.Partial
+                    << " UP.Runtime=" << UP.Runtime << "\n");
 
   // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
   // as threshold later on.
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
new file mode 100644
index 0000000000000..dbde0df575472
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT:  [[ENTRY:.*:]]
+; APPLE-NEXT:    [[SHR:%.*]] = ashr i32 [[LEN]], 2
+; APPLE-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; APPLE:       [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT:    [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
+; APPLE-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE:       [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
+; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
+; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; APPLE:       [[FOR_BODY_EPIL]]:
+; APPLE-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
+; APPLE-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
+; APPLE-NEXT:    store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; APPLE:       [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT:    ret void
+; APPLE:       [[FOR_BODY]]:
+; APPLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
+; APPLE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; APPLE-NEXT:    [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
+; APPLE-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; APPLE-NEXT:    store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; APPLE-NEXT:    [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
+; APPLE-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; APPLE-NEXT:    [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
+; APPLE-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT:    store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; APPLE-NEXT:    [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
+; APPLE-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT:    store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; APPLE-NEXT:    [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
+; APPLE-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT:    store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
+; APPLE-NEXT:    [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
+; APPLE-NEXT:    [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT:    store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
+; APPLE-NEXT:    [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
+; APPLE-NEXT:    [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT:    store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+  %shr = ashr i32 %len, 2
+  %cmp7 = icmp sgt i32 %shr, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext nneg i32 %shr to i64
+  %wide.trip.count = zext nneg i32 %shr to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %1 = sub nsw i64 %0, %indvars.iv
+  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+  %2 = load <4 x float>, ptr %arrayidx, align 16
+  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+  store <4 x float> %2, ptr %arrayidx2, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
+

>From 5665ac093ecba02d3c82e681e4b06f18490040a9 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 8 Jul 2025 23:08:58 +0300
Subject: [PATCH 04/13] Simplified the vector.ll test + added checks for
 -mtriple aarch64

---
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 69 ++++++++++++-------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index dbde0df575472..cbbe5d63b4cee 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,21 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
-; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT:  [[ENTRY:.*:]]
-; APPLE-NEXT:    [[SHR:%.*]] = ashr i32 [[LEN]], 2
-; APPLE-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
 ; APPLE-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
 ; APPLE:       [[FOR_BODY_PREHEADER]]:
-; APPLE-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
-; APPLE-NEXT:    [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
-; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT:    [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
 ; APPLE-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
 ; APPLE-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
 ; APPLE:       [[FOR_BODY_PREHEADER_NEW]]:
-; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
 ; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
 ; APPLE-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
@@ -29,13 +27,13 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
 ; APPLE:       [[FOR_BODY_EPIL]]:
 ; APPLE-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
 ; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT:    [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT:    [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
 ; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
 ; APPLE-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
 ; APPLE-NEXT:    store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
 ; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
 ; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
 ; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -48,49 +46,49 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
 ; APPLE:       [[FOR_BODY]]:
 ; APPLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
 ; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT:    [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT:    [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
 ; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
 ; APPLE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
 ; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT:    [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT:    [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
 ; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
 ; APPLE-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
 ; APPLE-NEXT:    store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT:    [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT:    [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
 ; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
 ; APPLE-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
 ; APPLE-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT:    [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT:    [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
 ; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
 ; APPLE-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
 ; APPLE-NEXT:    store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT:    [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT:    [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
 ; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
 ; APPLE-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
 ; APPLE-NEXT:    store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT:    [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT:    [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
 ; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
 ; APPLE-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
 ; APPLE-NEXT:    store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT:    [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT:    [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
 ; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
 ; APPLE-NEXT:    [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
 ; APPLE-NEXT:    store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT:    [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT:    [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
 ; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
 ; APPLE-NEXT:    [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
@@ -100,14 +98,33 @@ define void @reverse(ptr %dst, ptr %src, i32 %len) {
 ; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
 ; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
 ;
+; GENERIC-LABEL: define void @reverse(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
+; GENERIC-NEXT:  [[ENTRY:.*:]]
+; GENERIC-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
+; GENERIC-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; GENERIC:       [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
+; GENERIC:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; GENERIC-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; GENERIC:       [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT:    ret void
+; GENERIC:       [[FOR_BODY]]:
+; GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; GENERIC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; GENERIC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
 entry:
-  %shr = ashr i32 %len, 2
-  %cmp7 = icmp sgt i32 %shr, 0
+  %cmp7 = icmp sgt i64 %len, 0
   br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
 
 for.body.preheader:                               ; preds = %entry
-  %0 = zext nneg i32 %shr to i64
-  %wide.trip.count = zext nneg i32 %shr to i64
   br label %for.body
 
 for.cond.cleanup:                                 ; preds = %for.body, %entry
@@ -115,17 +132,17 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %1 = sub nsw i64 %0, %indvars.iv
+  %1 = sub nsw i64 %len, %indvars.iv
   %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
   %2 = load <4 x float>, ptr %arrayidx, align 16
   %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
   store <4 x float> %2, ptr %arrayidx2, align 16
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %len
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
 ;.
-

>From 04459401eed9549db4eb6dcfe470a1030cb17d17 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 14:09:15 +0300
Subject: [PATCH 05/13] Revert a couple of debug prints

---
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 7a7c945e1cb24..a22d84dcf014d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1172,8 +1172,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
-                    << L->getHeader()->getName() << " Full=" << OnlyFullUnroll
-                    << " Loc=" << L->getLocStr() << "\n");
+                    << L->getHeader()->getName() << "\n");
   TransformationMode TM = hasUnrollTransformation(L);
   if (TM & TM_Disable)
     return LoopUnrollResult::Unmodified;
@@ -1220,8 +1219,6 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
       ProvidedFullUnrollMaxCount);
   TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
       L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
-  LLVM_DEBUG(dbgs() << "  UP.Partial=" << UP.Partial
-                    << " UP.Runtime=" << UP.Runtime << "\n");
 
   // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
   // as threshold later on.

>From 7ff7ce3ac52b2e700e54ef433e1cd9d0ffffac4d Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 18:03:15 +0300
Subject: [PATCH 06/13] Revert the non-interleaved auto-vectorized case +
 rename %indvars.iv

---
 .../AArch64/AArch64TargetTransformInfo.cpp       |  5 ++---
 .../test/Transforms/LoopUnroll/AArch64/vector.ll | 16 ++++++++--------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7616aac4028b3..230ad821a8a1e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,9 +4894,8 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
-  // No need to unroll auto-vectorized loops that were interleaved
-  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized") &&
-      findStringMetadataForLoop(L, "llvm.loop.interleave.count"))
+  // No need to unroll auto-vectorized loops
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
     return;
 
   // Scan the loop: don't unroll loops with calls as this could prevent
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index cbbe5d63b4cee..e6f195bc7147a 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -16,16 +16,16 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
 ; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
-; APPLE-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
 ; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
 ; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
-; APPLE-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
 ; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
 ; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
 ; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
 ; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
 ; APPLE:       [[FOR_BODY_EPIL]]:
-; APPLE-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
 ; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
 ; APPLE-NEXT:    [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
 ; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
@@ -131,14 +131,14 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret void
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %1 = sub nsw i64 %len, %indvars.iv
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %1 = sub nsw i64 %len, %iv
   %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
   %2 = load <4 x float>, ptr %arrayidx, align 16
-  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
   store <4 x float> %2, ptr %arrayidx2, align 16
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %len
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %len
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 

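[Editor's note, not part of the patch series.] After this change a single llvm.loop.isvectorized entry is enough for AArch64TTIImpl::getUnrollingPreferences to return early, so auto-vectorized loops no longer receive the target's partial/runtime unrolling settings even when no interleave count was recorded. A hedged sketch of the metadata that now suffices (node numbers arbitrary):

; Sketch: vectorized loop without an interleave-count entry; the relaxed
; check in PATCH 06 still skips it.
  br i1 %cond, label %exit, label %vector.body, !llvm.loop !0

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
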
>From 1553324d5d763969341a9315fb59366a17ba7b93 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 10 Jul 2025 21:58:20 +0300
Subject: [PATCH 07/13] Add a 2nd test for an auto-vectorized loop with a
 static trip count

---
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 164 +++++++++++++++---
 1 file changed, 137 insertions(+), 27 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index e6f195bc7147a..c4ee8c2f08204 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -4,24 +4,21 @@
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
-; APPLE-NEXT:  [[ENTRY:.*:]]
-; APPLE-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
-; APPLE-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; APPLE:       [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT:  [[FOR_BODY_PREHEADER:.*]]:
 ; APPLE-NEXT:    [[TMP5:%.*]] = add i64 [[LEN]], -1
 ; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
 ; APPLE-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
 ; APPLE:       [[FOR_BODY_PREHEADER_NEW]]:
 ; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
-; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE:       [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
 ; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
-; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
-; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
+; APPLE:       [[FOR_COND_CLEANUP_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
 ; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
 ; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
 ; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
 ; APPLE:       [[FOR_BODY_EPIL]]:
@@ -36,10 +33,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
 ; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
 ; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
-; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
-; APPLE:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
 ; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP]]
 ; APPLE:       [[FOR_COND_CLEANUP]]:
 ; APPLE-NEXT:    ret void
@@ -96,18 +91,13 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
 ; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
-; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
 ;
 ; GENERIC-LABEL: define void @reverse(
 ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT:  [[ENTRY:.*:]]
-; GENERIC-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
-; GENERIC-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; GENERIC:       [[FOR_BODY_PREHEADER]]:
+; GENERIC-NEXT:  [[FOR_BODY_PREHEADER:.*]]:
 ; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
-; GENERIC:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
-; GENERIC-NEXT:    br label %[[FOR_COND_CLEANUP]]
-; GENERIC:       [[FOR_COND_CLEANUP]]:
+; GENERIC:       [[FOR_COND_CLEANUP:.*]]:
 ; GENERIC-NEXT:    ret void
 ; GENERIC:       [[FOR_BODY]]:
 ; GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -118,12 +108,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
 ; GENERIC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
-entry:
-  %cmp7 = icmp sgt i64 %len, 0
-  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
@@ -142,7 +128,131 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
+define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
+; APPLE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; APPLE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
+; APPLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
+; APPLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
+; APPLE-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; APPLE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
+; APPLE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
+; APPLE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; APPLE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
+; APPLE-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
+; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
+; APPLE-NEXT:    store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
+; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; APPLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE:       [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT:    ret void
+;
+; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
+; GENERIC-NEXT:  [[ENTRY:.*]]:
+; GENERIC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; GENERIC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; GENERIC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; GENERIC:       [[VECTOR_BODY]]:
+; GENERIC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; GENERIC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
+; GENERIC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; GENERIC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
+; GENERIC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
+; GENERIC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
+; GENERIC-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; GENERIC-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
+; GENERIC-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
+; GENERIC-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; GENERIC-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
+; GENERIC-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
+; GENERIC-NEXT:    store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
+; GENERIC-NEXT:    store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
+; GENERIC-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
+; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; GENERIC-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; GENERIC-NEXT:    br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC:       [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %2 = getelementptr inbounds nuw i8, ptr %0, i64 32
+  %3 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %wide.load13 = load <4 x float>, ptr %2, align 4
+  %wide.load14 = load <4 x float>, ptr %3, align 4
+  %4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
+  %6 = getelementptr inbounds nuw i8, ptr %4, i64 32
+  %7 = getelementptr inbounds nuw i8, ptr %4, i64 48
+  %wide.load15 = load <4 x float>, ptr %4, align 4
+  %wide.load16 = load <4 x float>, ptr %5, align 4
+  %wide.load17 = load <4 x float>, ptr %6, align 4
+  %wide.load18 = load <4 x float>, ptr %7, align 4
+  %8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
+  %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
+  %10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
+  %11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
+  store <4 x float> %8, ptr %4, align 4
+  store <4 x float> %9, ptr %5, align 4
+  store <4 x float> %10, ptr %6, align 4
+  store <4 x float> %11, ptr %7, align 4
+  %index.next = add nuw i64 %index, 16
+  %12 = icmp eq i64 %index.next, 1024
+  br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+!22 = !{!"llvm.loop.isvectorized", i32 1}
+
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.

>From a3c2ac64e5846d1ebaefb2d763abf04049c7e51e Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Fri, 11 Jul 2025 00:10:43 +0300
Subject: [PATCH 08/13] Shortened the test & cleaned up entry/exit labels

---
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 185 ++++++------------
 1 file changed, 64 insertions(+), 121 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index c4ee8c2f08204..8a3e17588610e 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -4,43 +4,17 @@
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
-; APPLE-NEXT:  [[FOR_BODY_PREHEADER:.*]]:
+; APPLE-NEXT:  [[ENTRY:.*]]:
 ; APPLE-NEXT:    [[TMP5:%.*]] = add i64 [[LEN]], -1
 ; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
 ; APPLE-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
-; APPLE:       [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT:    br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
 ; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
-; APPLE:       [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
-; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
-; APPLE:       [[FOR_COND_CLEANUP_UNR_LCSSA]]:
-; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
-; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
-; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
-; APPLE:       [[FOR_BODY_EPIL]]:
-; APPLE-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT:    [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
-; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
-; APPLE-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
-; APPLE-NEXT:    store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
-; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
-; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; APPLE:       [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
-; APPLE-NEXT:    br label %[[FOR_COND_CLEANUP]]
-; APPLE:       [[FOR_COND_CLEANUP]]:
-; APPLE-NEXT:    ret void
 ; APPLE:       [[FOR_BODY]]:
-; APPLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
-; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
 ; APPLE-NEXT:    [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
 ; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
 ; APPLE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -91,16 +65,40 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
 ; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
-; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; APPLE:       [[FOR_BODY_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
+; APPLE-NEXT:    [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
+; APPLE-NEXT:    store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
 ;
 ; GENERIC-LABEL: define void @reverse(
 ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT:  [[FOR_BODY_PREHEADER:.*]]:
+; GENERIC-NEXT:  [[ENTRY:.*]]:
 ; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
-; GENERIC:       [[FOR_COND_CLEANUP:.*]]:
-; GENERIC-NEXT:    ret void
 ; GENERIC:       [[FOR_BODY]]:
-; GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
 ; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -108,16 +106,15 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
 ; GENERIC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; GENERIC:       [[EXIT]]:
+; GENERIC-NEXT:    ret void
 ;
-for.body.preheader:                               ; preds = %entry
+entry:                               ; preds = %entry
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %1 = sub nsw i64 %len, %iv
   %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
   %2 = load <4 x float>, ptr %arrayidx, align 16
@@ -125,13 +122,13 @@ for.body:                                         ; preds = %for.body.preheader,
   store <4 x float> %2, ptr %arrayidx2, align 16
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %len
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  ret void
 }
 
 
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
-; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
 define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -142,33 +139,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE:       [[VECTOR_BODY]]:
 ; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
-; APPLE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
-; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
 ; APPLE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; APPLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
-; APPLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
-; APPLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
-; APPLE-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
-; APPLE-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
-; APPLE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
-; APPLE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
-; APPLE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; APPLE-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
 ; APPLE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; APPLE-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
-; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
-; APPLE-NEXT:    store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
-; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
-; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; APPLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; APPLE-NEXT:    br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; APPLE:       [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
 ; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
@@ -180,33 +159,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; GENERIC:       [[VECTOR_BODY]]:
 ; GENERIC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; GENERIC-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-; GENERIC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
-; GENERIC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
-; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
 ; GENERIC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
-; GENERIC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
-; GENERIC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
-; GENERIC-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
-; GENERIC-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
-; GENERIC-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
-; GENERIC-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
+; GENERIC-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
 ; GENERIC-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; GENERIC-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
-; GENERIC-NEXT:    store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
-; GENERIC-NEXT:    store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
-; GENERIC-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
-; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; GENERIC-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; GENERIC-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT:    br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; GENERIC:       [[FOR_COND_CLEANUP]]:
+; GENERIC-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC:       [[EXIT]]:
 ; GENERIC-NEXT:    ret void
 ;
 entry:
@@ -217,37 +178,19 @@ entry:
 vector.body:                                      ; preds = %vector.body, %entry
   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
   %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
-  %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
-  %2 = getelementptr inbounds nuw i8, ptr %0, i64 32
-  %3 = getelementptr inbounds nuw i8, ptr %0, i64 48
   %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
   %wide.load12 = load <4 x float>, ptr %1, align 4
-  %wide.load13 = load <4 x float>, ptr %2, align 4
-  %wide.load14 = load <4 x float>, ptr %3, align 4
-  %4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
-  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
-  %6 = getelementptr inbounds nuw i8, ptr %4, i64 32
-  %7 = getelementptr inbounds nuw i8, ptr %4, i64 48
-  %wide.load15 = load <4 x float>, ptr %4, align 4
-  %wide.load16 = load <4 x float>, ptr %5, align 4
-  %wide.load17 = load <4 x float>, ptr %6, align 4
-  %wide.load18 = load <4 x float>, ptr %7, align 4
-  %8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
-  %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
-  %10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
-  %11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
-  store <4 x float> %8, ptr %4, align 4
-  store <4 x float> %9, ptr %5, align 4
-  store <4 x float> %10, ptr %6, align 4
-  store <4 x float> %11, ptr %7, align 4
-  %index.next = add nuw i64 %index, 16
-  %12 = icmp eq i64 %index.next, 1024
-  br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %exit, label %vector.body, !llvm.loop !0
 
-for.cond.cleanup:                                 ; preds = %vector.body
+exit:                                 ; preds = %vector.body
   ret void
 }
-!22 = !{!"llvm.loop.isvectorized", i32 1}
+!0 = !{!"llvm.loop.isvectorized", i32 1}
 
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}

>From 0f3dad93f51887bf2a2145c26ade6dd4376db1a0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Sun, 13 Jul 2025 22:40:00 +0300
Subject: [PATCH 09/13] updated the vector.ll test and added COMMON prefix

---
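Note: with --check-prefixes=COMMON,APPLE (and COMMON,GENERIC), FileCheck accepts
directives under either prefix, so assertions that are identical for both RUN
lines could in principle be written once, e.g. (illustrative only, not part of
the autogenerated checks):

  ; COMMON-LABEL: define void @saxpy_tripcount1K_av1(
  ; COMMON:       call <4 x float> @llvm.fmuladd.v4f32(

Because the APPLE and GENERIC bodies differ, update_test_checks.py still emits
separate per-prefix blocks and the COMMON prefix ends up unused (see the NOTE it
appends at the end of the file).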
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 158 +++++++++---------
 1 file changed, 80 insertions(+), 78 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 8a3e17588610e..36eee7549ec3a 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,73 +1,73 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT:  [[ENTRY:.*]]:
-; APPLE-NEXT:    [[TMP5:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
 ; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
-; APPLE-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
-; APPLE-NEXT:    br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
 ; APPLE:       [[ENTRY_NEW]]:
 ; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
 ; APPLE:       [[FOR_BODY]]:
-; APPLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
 ; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
-; APPLE-NEXT:    [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
-; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
-; APPLE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
-; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; APPLE-NEXT:    [[TMP7:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT]]
-; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
-; APPLE-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
-; APPLE-NEXT:    store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; APPLE-NEXT:    [[TMP9:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_1]]
-; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
-; APPLE-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
-; APPLE-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; APPLE-NEXT:    [[TMP11:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_2]]
-; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
-; APPLE-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
-; APPLE-NEXT:    store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; APPLE-NEXT:    [[TMP13:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_3]]
-; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
-; APPLE-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
-; APPLE-NEXT:    store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; APPLE-NEXT:    [[TMP15:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_4]]
-; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
-; APPLE-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
-; APPLE-NEXT:    store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
-; APPLE-NEXT:    [[TMP17:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_5]]
-; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
-; APPLE-NEXT:    [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
-; APPLE-NEXT:    store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
-; APPLE-NEXT:    [[TMP19:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_NEXT_6]]
-; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
-; APPLE-NEXT:    [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
-; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
-; APPLE-NEXT:    store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
-; APPLE-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; APPLE-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; APPLE-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; APPLE-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; APPLE-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; APPLE-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; APPLE-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; APPLE-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; APPLE-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
+; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; APPLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; APPLE-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
+; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; APPLE-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; APPLE-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
+; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; APPLE-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; APPLE-NEXT:    [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
+; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
+; APPLE-NEXT:    [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
 ; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
 ; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
 ; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
 ; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
-; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
 ; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
 ; APPLE:       [[EXIT_UNR_LCSSA]]:
 ; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
@@ -78,11 +78,11 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE:       [[FOR_BODY_EPIL]]:
 ; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
 ; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
-; APPLE-NEXT:    [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
-; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
-; APPLE-NEXT:    [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT:    [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
+; APPLE-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
 ; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
-; APPLE-NEXT:    store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT:    store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
 ; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
 ; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
 ; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
@@ -98,14 +98,14 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; GENERIC-NEXT:  [[ENTRY:.*]]:
 ; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
 ; GENERIC:       [[FOR_BODY]]:
-; GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
+; GENERIC-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
 ; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; GENERIC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; GENERIC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
 ; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
-; GENERIC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
+; GENERIC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
 ; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; GENERIC:       [[EXIT]]:
 ; GENERIC-NEXT:    ret void
@@ -139,14 +139,14 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE:       [[VECTOR_BODY]]:
 ; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; APPLE-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; APPLE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; APPLE-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; APPLE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
 ; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; APPLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; APPLE-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
@@ -159,14 +159,14 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; GENERIC:       [[VECTOR_BODY]]:
 ; GENERIC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; GENERIC-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
-; GENERIC-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
-; GENERIC-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; GENERIC-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
 ; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; GENERIC-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; GENERIC-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; GENERIC:       [[EXIT]]:
 ; GENERIC-NEXT:    ret void
 ;
@@ -199,3 +199,5 @@ exit:                                 ; preds = %vector.body
 ;.
 ; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}

>From d33001d7e1ae01cbc17aa5fc8314cd6590a84539 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 12:31:00 +0300
Subject: [PATCH 10/13] added test for full unroll; fixed !llvm.loop metadata;
 undid (non-working) COMMON prefix

---
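Note on the metadata fix: the !llvm.loop operand on the vector loop was
previously just the property tuple; loop metadata is expected to be a node
whose first operand is a self-reference, with the properties as the remaining
operands, i.e. (as in the updated test below):

  !0 = !{!0, !1}
  !1 = !{!"llvm.loop.isvectorized", i32 1}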
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 76 +++++++++++++++++--
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 36eee7549ec3a..bff8433b75b1b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -129,6 +129,67 @@ exit:                                 ; preds = %for.body, %entry
 }
 
 
+define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*:]]
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; APPLE-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; APPLE-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; APPLE-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; APPLE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; APPLE-NEXT:    ret void
+;
+; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
+; GENERIC-NEXT:  [[ENTRY:.*:]]
+; GENERIC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; GENERIC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; GENERIC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; GENERIC:       [[VECTOR_BODY]]:
+; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; GENERIC-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; GENERIC-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; GENERIC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; GENERIC-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; GENERIC-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; GENERIC-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; GENERIC-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 8
+  br i1 %3, label %exit, label %vector.body
+
+exit:                                 ; preds = %vector.body
+  ret void
+}
+
+
 define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -190,14 +251,15 @@ vector.body:                                      ; preds = %vector.body, %entry
 exit:                                 ; preds = %vector.body
   ret void
 }
-!0 = !{!"llvm.loop.isvectorized", i32 1}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
 
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
-; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
+; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
-; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
+; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; COMMON: {{.*}}

>From eed737c62200b8aa3f8248b23d7b1e069916f9bb Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 16:30:09 +0300
Subject: [PATCH 11/13] another saxpy test flavor that Cortex-A55 can unroll +
 replace prefix=GENERIC with CORTEXA55

---
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 297 +++++++++++++++++-
 1 file changed, 291 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index bff8433b75b1b..45eb9c50b34eb 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
+; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -93,6 +93,85 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
+; CORTEXA55-LABEL: define void @reverse(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
+; CORTEXA55-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 3
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
+; CORTEXA55-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CORTEXA55:       [[ENTRY_NEW]]:
+; CORTEXA55-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; CORTEXA55-NEXT:    br label %[[FOR_BODY:.*]]
+; CORTEXA55:       [[FOR_BODY]]:
+; CORTEXA55-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; CORTEXA55-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CORTEXA55-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; CORTEXA55-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CORTEXA55-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; CORTEXA55-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CORTEXA55-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; CORTEXA55-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CORTEXA55-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
+; CORTEXA55-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
+; CORTEXA55-NEXT:    br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CORTEXA55:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CORTEXA55-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; CORTEXA55:       [[EXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CORTEXA55-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CORTEXA55-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL_PREHEADER]]:
+; CORTEXA55-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL]]:
+; CORTEXA55-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; CORTEXA55-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
+; CORTEXA55-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
+; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL_1]]:
+; CORTEXA55-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; CORTEXA55-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
+; CORTEXA55-NEXT:    [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
+; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55:       [[FOR_BODY_EPIL_2]]:
+; CORTEXA55-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; CORTEXA55-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
+; CORTEXA55-NEXT:    br label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55:       [[EXIT_EPILOG_LCSSA]]:
+; CORTEXA55-NEXT:    br label %[[EXIT]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
 ; GENERIC-LABEL: define void @reverse(
 ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
 ; GENERIC-NEXT:  [[ENTRY:.*]]:
@@ -109,7 +188,6 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; GENERIC:       [[EXIT]]:
 ; GENERIC-NEXT:    ret void
-;
 entry:                               ; preds = %entry
   br label %for.body
 
@@ -149,6 +227,25 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
 ; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
 ; APPLE-NEXT:    ret void
 ;
+; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*:]]
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CORTEXA55-NEXT:    ret void
+;
 ; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
 ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
 ; GENERIC-NEXT:  [[ENTRY:.*:]]
@@ -167,7 +264,6 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
 ; GENERIC-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
 ; GENERIC-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
 ; GENERIC-NEXT:    ret void
-;
 entry:
   %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -190,6 +286,174 @@ exit:                                 ; preds = %vector.body
 }
 
 
+define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CORTEXA55-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CORTEXA55-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
+; CORTEXA55-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CORTEXA55-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CORTEXA55-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
+; CORTEXA55-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CORTEXA55-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
+; CORTEXA55-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
+; CORTEXA55-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CORTEXA55-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CORTEXA55-NEXT:    [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
+; CORTEXA55-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
+; CORTEXA55-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
+; CORTEXA55-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
+; CORTEXA55-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
+; CORTEXA55-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
+; CORTEXA55-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
+; CORTEXA55-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
+; CORTEXA55-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
+; CORTEXA55-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
+; CORTEXA55-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; CORTEXA55-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
+; CORTEXA55-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
+; CORTEXA55-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
+; CORTEXA55-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
+; CORTEXA55-NEXT:    [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
+; CORTEXA55-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
+; CORTEXA55-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
+; CORTEXA55-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
+; CORTEXA55-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
+; CORTEXA55-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
+; CORTEXA55-NEXT:    [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
+; CORTEXA55-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
+; CORTEXA55-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
+; CORTEXA55-NEXT:    [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
+; CORTEXA55-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
+; CORTEXA55-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
+; CORTEXA55-NEXT:    [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
+; CORTEXA55-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
+; CORTEXA55-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
+; CORTEXA55-NEXT:    [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
+; CORTEXA55-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
+; CORTEXA55-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
+; CORTEXA55-NEXT:    [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
+; CORTEXA55-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
+; CORTEXA55-NEXT:    br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %exit, label %vector.body
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+
+
 define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -211,6 +475,26 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CORTEXA55-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
 ; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
 ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
 ; GENERIC-NEXT:  [[ENTRY:.*]]:
@@ -230,7 +514,6 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; GENERIC-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; GENERIC:       [[EXIT]]:
 ; GENERIC-NEXT:    ret void
-;
 entry:
   %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -254,12 +537,14 @@ exit:                                 ; preds = %vector.body
 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.isvectorized", i32 1}
 
+; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
 ; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
 ; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
-; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.

>From 06b3012754b9616eea382c20e5cf934363aa06ea Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 14 Jul 2025 17:38:07 +0300
Subject: [PATCH 12/13] cleaned up unused GENERIC check lines

---
 .../Transforms/LoopUnroll/AArch64/vector.ll   | 55 -------------------
 1 file changed, 55 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index 45eb9c50b34eb..c3a713142a63b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -172,22 +172,6 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; CORTEXA55:       [[EXIT]]:
 ; CORTEXA55-NEXT:    ret void
 ;
-; GENERIC-LABEL: define void @reverse(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
-; GENERIC-NEXT:  [[ENTRY:.*]]:
-; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
-; GENERIC:       [[FOR_BODY]]:
-; GENERIC-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
-; GENERIC-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-; GENERIC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
-; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
-; GENERIC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
-; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
-; GENERIC:       [[EXIT]]:
-; GENERIC-NEXT:    ret void
 entry:                               ; preds = %entry
   br label %for.body
 
@@ -246,24 +230,6 @@ define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
 ; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
 ; CORTEXA55-NEXT:    ret void
 ;
-; GENERIC-LABEL: define void @saxpy_tripcount8_full_unroll(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
-; GENERIC-NEXT:  [[ENTRY:.*:]]
-; GENERIC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
-; GENERIC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; GENERIC-NEXT:    br label %[[VECTOR_BODY:.*]]
-; GENERIC:       [[VECTOR_BODY]]:
-; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
-; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
-; GENERIC-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-; GENERIC-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
-; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
-; GENERIC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
-; GENERIC-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; GENERIC-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
-; GENERIC-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
-; GENERIC-NEXT:    ret void
 entry:
   %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -495,25 +461,6 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
 ; CORTEXA55:       [[EXIT]]:
 ; CORTEXA55-NEXT:    ret void
 ;
-; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
-; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
-; GENERIC-NEXT:  [[ENTRY:.*]]:
-; GENERIC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
-; GENERIC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; GENERIC-NEXT:    br label %[[VECTOR_BODY:.*]]
-; GENERIC:       [[VECTOR_BODY]]:
-; GENERIC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; GENERIC-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
-; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; GENERIC-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-; GENERIC-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
-; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; GENERIC-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; GENERIC-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; GENERIC:       [[EXIT]]:
-; GENERIC-NEXT:    ret void
 entry:
   %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -537,8 +484,6 @@ exit:                                 ; preds = %vector.body
 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.isvectorized", i32 1}
 
-; GENERIC: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; GENERIC: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}

>From a48c02a3bba8cd1dc9be59732f5b006c0baba80b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 14 Jul 2025 20:52:15 +0100
Subject: [PATCH 13/13] Update vector.ll

---
 llvm/test/Transforms/LoopUnroll/AArch64/vector.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
index c3a713142a63b..8baded897fd7d 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
 ; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
+
 define void @reverse(ptr %dst, ptr %src, i64 %len) {
 ; APPLE-LABEL: define void @reverse(
 ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {


