[llvm] [AArch64] Unrolling of loops with vector instructions. (PR #147420)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 03:53:44 PDT 2025
================
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+define void @reverse(ptr %dst, ptr %src, i32 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT: [[ENTRY:.*:]]
+; APPLE-NEXT: [[SHR:%.*]] = ashr i32 [[LEN]], 2
+; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[SHR]], 0
+; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; APPLE: [[FOR_BODY_PREHEADER]]:
+; APPLE-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SHR]] to i64
+; APPLE-NEXT: [[TMP5:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
+; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
+; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; APPLE-NEXT: br label %[[FOR_BODY:.*]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; APPLE: [[FOR_BODY_EPIL]]:
+; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
+; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
+; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[WIDE_TRIP_COUNT]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; APPLE: [[FOR_COND_CLEANUP]]:
+; APPLE-NEXT: ret void
+; APPLE: [[FOR_BODY]]:
+; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV]]
+; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
+; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
+; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; APPLE-NEXT: [[TMP7:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP7]]
+; APPLE-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; APPLE-NEXT: [[TMP9:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP9]]
+; APPLE-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_1]]
+; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; APPLE-NEXT: [[TMP11:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP11]]
+; APPLE-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_2]]
+; APPLE-NEXT: store <4 x float> [[TMP12]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; APPLE-NEXT: [[TMP13:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP13]]
+; APPLE-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_3]]
+; APPLE-NEXT: store <4 x float> [[TMP14]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; APPLE-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP15]]
+; APPLE-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_4]]
+; APPLE-NEXT: store <4 x float> [[TMP16]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
+; APPLE-NEXT: [[TMP17:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP17]]
+; APPLE-NEXT: [[TMP18:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_5]]
+; APPLE-NEXT: store <4 x float> [[TMP18]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
+; APPLE-NEXT: [[TMP19:%.*]] = sub nsw i64 [[TMP0]], [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP19]]
+; APPLE-NEXT: [[TMP20:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_NEXT_6]]
+; APPLE-NEXT: store <4 x float> [[TMP20]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+ %shr = ashr i32 %len, 2
+ %cmp7 = icmp sgt i32 %shr, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = zext nneg i32 %shr to i64
+ %wide.trip.count = zext nneg i32 %shr to i64
----------------
fhahn wrote:
Could you just pass an i64 trip count as argument directly to simplify this?
https://github.com/llvm/llvm-project/pull/147420
More information about the llvm-commits
mailing list