[llvm] [LV] Pre-commit test for vectorisation of SAXPY unrolled by 5 (NFC). (PR #153039)

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 24 12:21:48 PDT 2025


================
@@ -0,0 +1,249 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=loop-vectorize | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -S -passes=loop-vectorize -mattr=+sve | FileCheck %s --check-prefix=CHECK-SVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test contains an example of where vectorising a loop based on SAXPY
+; manually unrolled by five is not profitable:
+;
+;   void saxpy(long n, float a, float *x, float *y) {
+;     for (int i = 0; i < n; i += 5) {
+;       y[i] += a * x[i];
+;       y[i + 1] += a * x[i + 1];
+;       y[i + 2] += a * x[i + 2];
+;       y[i + 3] += a * x[i + 3];
+;       y[i + 4] += a * x[i + 4];
+;     }
+;   }
+;
+; Note: Even though the loop is not vectorised with scalable vectors, the issue
+; currently only manifests itself with +sve due to an interaction with
+; `prefersVectorizedAddressing'.
+
+define void @saxpy(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
+; CHECK-LABEL: define void @saxpy(
+; CHECK-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP36:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast float [[TMP3]], [[A]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast float [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    store float [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], [[A]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    store float [[TMP14]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast float [[TMP17]], [[A]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd fast float [[TMP20]], [[TMP18]]
+; CHECK-NEXT:    store float [[TMP21]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = fmul fast float [[TMP24]], [[A]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast float [[TMP27]], [[TMP25]]
+; CHECK-NEXT:    store float [[TMP28]], ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = add nuw nsw i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast float [[TMP31]], [[A]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr [[TMP33]], align 4
+; CHECK-NEXT:    [[TMP35:%.*]] = fadd fast float [[TMP34]], [[TMP32]]
+; CHECK-NEXT:    store float [[TMP35]], ptr [[TMP33]], align 4
+; CHECK-NEXT:    [[TMP36]] = add nuw nsw i64 [[TMP1]], 5
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp sgt i64 [[N]], [[TMP36]]
+; CHECK-NEXT:    br i1 [[TMP37]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; CHECK-SVE-LABEL: define void @saxpy(
+; CHECK-SVE-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*:]]
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-SVE-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-SVE:       [[LOOP_PREHEADER]]:
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = add i64 [[N]], -1
+; CHECK-SVE-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 5
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
+; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-SVE:       [[VECTOR_PH]]:
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
+; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 5
+; CHECK-SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+; CHECK-SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-SVE-NEXT:    [[WIDE_VEC:%.*]] = load <10 x float>, ptr [[TMP5]], align 4
+; CHECK-SVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
+; CHECK-SVE-NEXT:    [[WIDE_VEC5:%.*]] = load <10 x float>, ptr [[TMP7]], align 4
+; CHECK-SVE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
+; CHECK-SVE-NEXT:    [[TMP8:%.*]] = fadd fast <2 x float> [[STRIDED_VEC6]], [[TMP6]]
+; CHECK-SVE-NEXT:    [[TMP9:%.*]] = fmul fast <2 x float> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = fadd fast <2 x float> [[STRIDED_VEC7]], [[TMP9]]
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = fmul fast <2 x float> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP12:%.*]] = fadd fast <2 x float> [[STRIDED_VEC8]], [[TMP11]]
+; CHECK-SVE-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP14:%.*]] = fadd fast <2 x float> [[STRIDED_VEC9]], [[TMP13]]
+; CHECK-SVE-NEXT:    [[TMP15:%.*]] = fmul fast <2 x float> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP16:%.*]] = fadd fast <2 x float> [[STRIDED_VEC10]], [[TMP15]]
+; CHECK-SVE-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-SVE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-SVE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-SVE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SVE-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP19]], <8 x float> [[TMP20]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-SVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x float> [[TMP21]], <10 x float> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
+; CHECK-SVE-NEXT:    store <10 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-SVE-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-SVE:       [[SCALAR_PH]]:
+; CHECK-SVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-SVE-NEXT:    br label %[[LOOP:.*]]
+; CHECK-SVE:       [[LOOP]]:
+; CHECK-SVE-NEXT:    [[TMP23:%.*]] = phi i64 [ [[TMP58:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-SVE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP23]]
+; CHECK-SVE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4
+; CHECK-SVE-NEXT:    [[TMP26:%.*]] = fmul fast float [[TMP25]], [[A]]
+; CHECK-SVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP23]]
+; CHECK-SVE-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP27]], align 4
+; CHECK-SVE-NEXT:    [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP26]]
+; CHECK-SVE-NEXT:    store float [[TMP29]], ptr [[TMP27]], align 4
+; CHECK-SVE-NEXT:    [[TMP30:%.*]] = add nuw nsw i64 [[TMP23]], 1
+; CHECK-SVE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP30]]
+; CHECK-SVE-NEXT:    [[TMP32:%.*]] = load float, ptr [[TMP31]], align 4
+; CHECK-SVE-NEXT:    [[TMP33:%.*]] = fmul fast float [[TMP32]], [[A]]
+; CHECK-SVE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP30]]
+; CHECK-SVE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4
+; CHECK-SVE-NEXT:    [[TMP36:%.*]] = fadd fast float [[TMP35]], [[TMP33]]
+; CHECK-SVE-NEXT:    store float [[TMP36]], ptr [[TMP34]], align 4
+; CHECK-SVE-NEXT:    [[TMP37:%.*]] = add nuw nsw i64 [[TMP23]], 2
+; CHECK-SVE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP37]]
+; CHECK-SVE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4
+; CHECK-SVE-NEXT:    [[TMP40:%.*]] = fmul fast float [[TMP39]], [[A]]
+; CHECK-SVE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP37]]
+; CHECK-SVE-NEXT:    [[TMP42:%.*]] = load float, ptr [[TMP41]], align 4
+; CHECK-SVE-NEXT:    [[TMP43:%.*]] = fadd fast float [[TMP42]], [[TMP40]]
+; CHECK-SVE-NEXT:    store float [[TMP43]], ptr [[TMP41]], align 4
+; CHECK-SVE-NEXT:    [[TMP44:%.*]] = add nuw nsw i64 [[TMP23]], 3
+; CHECK-SVE-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP44]]
+; CHECK-SVE-NEXT:    [[TMP46:%.*]] = load float, ptr [[TMP45]], align 4
+; CHECK-SVE-NEXT:    [[TMP47:%.*]] = fmul fast float [[TMP46]], [[A]]
+; CHECK-SVE-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP44]]
+; CHECK-SVE-NEXT:    [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4
+; CHECK-SVE-NEXT:    [[TMP50:%.*]] = fadd fast float [[TMP49]], [[TMP47]]
+; CHECK-SVE-NEXT:    store float [[TMP50]], ptr [[TMP48]], align 4
+; CHECK-SVE-NEXT:    [[TMP51:%.*]] = add nuw nsw i64 [[TMP23]], 4
+; CHECK-SVE-NEXT:    [[TMP52:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP51]]
+; CHECK-SVE-NEXT:    [[TMP53:%.*]] = load float, ptr [[TMP52]], align 4
+; CHECK-SVE-NEXT:    [[TMP54:%.*]] = fmul fast float [[TMP53]], [[A]]
+; CHECK-SVE-NEXT:    [[TMP55:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP51]]
+; CHECK-SVE-NEXT:    [[TMP56:%.*]] = load float, ptr [[TMP55]], align 4
+; CHECK-SVE-NEXT:    [[TMP57:%.*]] = fadd fast float [[TMP56]], [[TMP54]]
+; CHECK-SVE-NEXT:    store float [[TMP57]], ptr [[TMP55]], align 4
+; CHECK-SVE-NEXT:    [[TMP58]] = add nuw nsw i64 [[TMP23]], 5
+; CHECK-SVE-NEXT:    [[TMP59:%.*]] = icmp sgt i64 [[N]], [[TMP58]]
+; CHECK-SVE-NEXT:    br i1 [[TMP59]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE:       [[EXIT_LOOPEXIT]]:
+; CHECK-SVE-NEXT:    br label %[[EXIT]]
+; CHECK-SVE:       [[EXIT]]:
+; CHECK-SVE-NEXT:    ret void
+;
+entry:
+  %0 = icmp sgt i64 %n, 0
----------------
davemgreen wrote:

Oh that's good to hear. #149047 should ideally be happening inside the vectorizer (the deinterleaving), so that all the costs can be more correct as it vectorizes. (And it could learn different tricks). The tests are probably worth having if we don't have them elsewhere.

https://github.com/llvm/llvm-project/pull/153039


More information about the llvm-commits mailing list