[llvm] [RISCV] Adjust unroll prefs for loops with vectors (PR #151525)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 07:09:15 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
Adjust the unrolling preferences to unroll hand-vectorized code, as well as the scalar remainder of a vectorized loop. Inspired by a similar effort in AArch64: see #<!-- -->147420 and #<!-- -->151164.
---
Patch is 42.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151525.diff
2 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+7-8)
- (added) llvm/test/Transforms/LoopUnroll/RISCV/vector.ll (+603)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 61dbd066d3e22..0d5eb86bf899c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2627,18 +2627,17 @@ void RISCVTTIImpl::getUnrollingPreferences(
if (L->getNumBlocks() > 4)
return;
- // Don't unroll vectorized loops, including the remainder loop
- if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
- return;
-
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining.
+ // inlining. Don't unroll auto-vectorized loops either, though do allow
+ // unrolling of the scalar remainder.
+ bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
InstructionCost Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Initial setting - Don't unroll loops containing vectorized
- // instructions.
- if (I.getType()->isVectorTy())
+ // Both auto-vectorized loops and the scalar remainder have the
+ // isvectorized attribute, so differentiate between them by the presence
+ // of vector instructions.
+ if (IsVectorized && I.getType()->isVectorTy())
return;
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
diff --git a/llvm/test/Transforms/LoopUnroll/RISCV/vector.ll b/llvm/test/Transforms/LoopUnroll/RISCV/vector.ll
new file mode 100644
index 0000000000000..811d055cb4c45
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/RISCV/vector.ll
@@ -0,0 +1,603 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple riscv64 -mattr=+v,+f -S %s | FileCheck %s --check-prefixes=COMMON,CHECK
+; RUN: opt -p loop-unroll -mtriple=riscv64 -mcpu=sifive-s76 -S %s | FileCheck %s --check-prefixes=COMMON,SIFIVE
+
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
+; CHECK-LABEL: define void @reverse(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; SIFIVE-LABEL: define void @reverse(
+; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; SIFIVE-NEXT: [[ENTRY:.*]]:
+; SIFIVE-NEXT: [[TMP2:%.*]] = add i64 [[LEN]], -1
+; SIFIVE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
+; SIFIVE-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 7
+; SIFIVE-NEXT: br i1 [[TMP3]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; SIFIVE: [[ENTRY_NEW]]:
+; SIFIVE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; SIFIVE-NEXT: br label %[[FOR_BODY:.*]]
+; SIFIVE: [[FOR_BODY]]:
+; SIFIVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; SIFIVE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; SIFIVE-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; SIFIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
+; SIFIVE-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; SIFIVE-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
+; SIFIVE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; SIFIVE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; SIFIVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; SIFIVE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; SIFIVE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; SIFIVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; SIFIVE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; SIFIVE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; SIFIVE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; SIFIVE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; SIFIVE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; SIFIVE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; SIFIVE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
+; SIFIVE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; SIFIVE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
+; SIFIVE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; SIFIVE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
+; SIFIVE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; SIFIVE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
+; SIFIVE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; SIFIVE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
+; SIFIVE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; SIFIVE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
+; SIFIVE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; SIFIVE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
+; SIFIVE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
+; SIFIVE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
+; SIFIVE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
+; SIFIVE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; SIFIVE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; SIFIVE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; SIFIVE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; SIFIVE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
+; SIFIVE-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; SIFIVE: [[EXIT_UNR_LCSSA]]:
+; SIFIVE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; SIFIVE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; SIFIVE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; SIFIVE: [[FOR_BODY_EPIL_PREHEADER]]:
+; SIFIVE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
+; SIFIVE: [[FOR_BODY_EPIL]]:
+; SIFIVE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
+; SIFIVE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
+; SIFIVE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
+; SIFIVE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
+; SIFIVE: [[FOR_BODY_EPIL_1]]:
+; SIFIVE-NEXT: [[TMP20:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP20]]
+; SIFIVE-NEXT: [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; SIFIVE-NEXT: store <4 x float> [[TMP21]], ptr [[ARRAYIDX2_EPIL_1]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
+; SIFIVE-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[FOR_BODY_EPIL_2]]:
+; SIFIVE-NEXT: [[TMP22:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP22]]
+; SIFIVE-NEXT: [[TMP23:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
+; SIFIVE-NEXT: store <4 x float> [[TMP23]], ptr [[ARRAYIDX2_EPIL_2]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL_2:%.*]] = add nuw nsw i64 [[IV_UNR]], 3
+; SIFIVE-NEXT: [[EPIL_ITER_CMP_2:%.*]] = icmp ne i64 3, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_2]], label %[[FOR_BODY_EPIL_3:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[FOR_BODY_EPIL_3]]:
+; SIFIVE-NEXT: [[TMP24:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_2]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP24]]
+; SIFIVE-NEXT: [[TMP25:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_3]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_2]]
+; SIFIVE-NEXT: store <4 x float> [[TMP25]], ptr [[ARRAYIDX2_EPIL_3]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL_3:%.*]] = add nuw nsw i64 [[IV_UNR]], 4
+; SIFIVE-NEXT: [[EPIL_ITER_CMP_3:%.*]] = icmp ne i64 4, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_3]], label %[[FOR_BODY_EPIL_4:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[FOR_BODY_EPIL_4]]:
+; SIFIVE-NEXT: [[TMP26:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_3]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP26]]
+; SIFIVE-NEXT: [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_4]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_3]]
+; SIFIVE-NEXT: store <4 x float> [[TMP27]], ptr [[ARRAYIDX2_EPIL_4]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL_4:%.*]] = add nuw nsw i64 [[IV_UNR]], 5
+; SIFIVE-NEXT: [[EPIL_ITER_CMP_4:%.*]] = icmp ne i64 5, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_4]], label %[[FOR_BODY_EPIL_5:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[FOR_BODY_EPIL_5]]:
+; SIFIVE-NEXT: [[TMP28:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_4]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP28]]
+; SIFIVE-NEXT: [[TMP29:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_5]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_4]]
+; SIFIVE-NEXT: store <4 x float> [[TMP29]], ptr [[ARRAYIDX2_EPIL_5]], align 16
+; SIFIVE-NEXT: [[IV_NEXT_EPIL_5:%.*]] = add nuw nsw i64 [[IV_UNR]], 6
+; SIFIVE-NEXT: [[EPIL_ITER_CMP_5:%.*]] = icmp ne i64 6, [[XTRAITER]]
+; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_5]], label %[[FOR_BODY_EPIL_6:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[FOR_BODY_EPIL_6]]:
+; SIFIVE-NEXT: [[TMP30:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_5]]
+; SIFIVE-NEXT: [[ARRAYIDX_EPIL_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP30]]
+; SIFIVE-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_6]], align 16
+; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_5]]
+; SIFIVE-NEXT: store <4 x float> [[TMP31]], ptr [[ARRAYIDX2_EPIL_6]], align 16
+; SIFIVE-NEXT: br label %[[EXIT_EPILOG_LCSSA]]
+; SIFIVE: [[EXIT_EPILOG_LCSSA]]:
+; SIFIVE-NEXT: br label %[[EXIT]]
+; SIFIVE: [[EXIT]]:
+; SIFIVE-NEXT: ret void
+;
+entry: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %1 = sub nsw i64 %len, %iv
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+ %2 = load <4 x float>, ptr %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
+ store <4 x float> %2, ptr %arrayidx2, align 16
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %len
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+
+define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
+; COMMON-LABEL: define void @saxpy_tripcount8_full_unroll(
+; COMMON-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; COMMON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
+; COMMON: [[VECTOR_BODY]]:
+; COMMON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; COMMON-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; COMMON-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; COMMON-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; COMMON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; COMMON-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; COMMON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; COMMON-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; COMMON-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; COMMON-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; COMMON-NEXT: ret void
+;
+entry:
+ %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+ %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+ %wide.load = load <4 x float>, ptr %0, align 4
+ %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+ %wide.load12 = load <4 x float>, ptr %1, align 4
+ %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+ store <4 x float> %2, ptr %1, align 4
+ %index.next = add nuw i64 %index, 4
+ %3 = icmp eq i64 %index.next, 8
+ br i1 %3, label %exit, label %vector.body
+
+exit: ; preds = %vector.body
+ ret void
+}
+
+
+define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
+; CHECK-LABEL: define void @saxpy_tripcount1K_av0(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; SIFIVE-LABEL: define void @saxpy_tripcount1K_av0(
+; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; SIFIVE-NEXT: [[ENTRY:.*]]:
+; SIFIVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; SIFIVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; SIFIVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SIFIVE: [[VECTOR_BODY]]:
+; SIFIVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SIFIVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; SIFIVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; SIFIVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; SIFIVE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; SIFIVE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; SIFIVE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; SIFIVE-NEXT: [[INDEX_NEXT1:%.*]] = add nuw nsw i64 [[INDEX]], 4
+; SIFIVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT1]]
+; SIFIVE-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; SIFIVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT1]]
+; SIFIVE-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; SIFIVE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[TM...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/151525
More information about the llvm-commits
mailing list