[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)

Ahmad Yasin via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 14 02:32:01 PDT 2025


================
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
+; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
+; APPLE:       [[FOR_BODY]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; APPLE-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; APPLE-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; APPLE-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; APPLE-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; APPLE-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; APPLE-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; APPLE-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; APPLE-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
+; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; APPLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; APPLE-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
+; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; APPLE-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; APPLE-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
+; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; APPLE-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; APPLE-NEXT:    [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
+; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
+; APPLE-NEXT:    [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; APPLE:       [[FOR_BODY_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
+; APPLE-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
+; APPLE-NEXT:    store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; GENERIC-LABEL: define void @reverse(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
+; GENERIC-NEXT:  [[ENTRY:.*]]:
+; GENERIC-NEXT:    br label %[[FOR_BODY:.*]]
+; GENERIC:       [[FOR_BODY]]:
+; GENERIC-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; GENERIC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; GENERIC-NEXT:    store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; GENERIC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
+; GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; GENERIC:       [[EXIT]]:
+; GENERIC-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %1 = sub nsw i64 %len, %iv
+  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+  %2 = load <4 x float>, ptr %arrayidx, align 16
+  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
+  store <4 x float> %2, ptr %arrayidx2, align 16
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %len
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+
+define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
+; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
+; GENERIC-NEXT:  [[ENTRY:.*]]:
+; GENERIC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; GENERIC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; GENERIC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; GENERIC:       [[VECTOR_BODY]]:
+; GENERIC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; GENERIC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; GENERIC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; GENERIC-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; GENERIC-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; GENERIC-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; GENERIC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; GENERIC-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; GENERIC-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; GENERIC:       [[EXIT]]:
+; GENERIC-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %exit, label %vector.body, !llvm.loop !0
+
+exit:                                 ; preds = %vector.body
+  ret void
+}
+!0 = !{!"llvm.loop.isvectorized", i32 1}
+
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
----------------
ayasin-a wrote:

Removing the unused COMMON check prefix from the RUN lines, per the autogenerated note above.
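
The updated RUN lines would then look something like this (a sketch; COMMON is simply dropped from each prefix list, since no COMMON check lines are generated):

  ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
  ; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s --check-prefix=GENERIC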

https://github.com/llvm/llvm-project/pull/147420