[llvm] [ARM][MVE] Invalid tail predication in LowOverheadLoop pass (PR #163941)

Sun Oct 19 03:42:50 PDT 2025

================
@@ -0,0 +1,231 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
+
+# The _wrong_ output of this test is to generate the body of the
+# tail-predicated loop like this:
+#
+#     $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+#     renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+#     $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0
+#     renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+#     $lr = MVE_LETP killed renamable $lr, %bb.1
+#
+# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of
+# the input MQPRCopy, because it won't copy the vector lanes disabled by
+# FPSCR.LTPSIZE, and those are needed in the output value of the loop.
+#
+# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying
+# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE.
+
+--- |
+  ; ModuleID = '162644.c'
+  source_filename = "162644.c"
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+  @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16
+
+  ; Function Attrs: nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none)
+  define dso_local <4 x float> @test_func(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
+    %3 = load <4 x float>, ptr @inactive, align 16, !tbaa !3
+    %4 = add i32 %1, 3
+    %5 = call i32 @llvm.smin.i32(i32 %1, i32 4)
+    %6 = sub i32 %4, %5
+    %7 = lshr i32 %6, 2
+    %8 = add nuw nsw i32 %7, 1
+    %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
+    br label %10
+
+  10:                                               ; preds = %10, %2
+    %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ]
+    %12 = phi i32 [ %1, %2 ], [ %19, %10 ]
+    %13 = phi ptr [ %0, %2 ], [ %18, %10 ]
+    %14 = phi i32 [ %9, %2 ], [ %20, %10 ]
+    %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+    %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer)
+    %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3)
+    %18 = getelementptr inbounds nuw i8, ptr %13, i32 16
+    %19 = add i32 %12, -4
+    %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1)
+    %21 = icmp ne i32 %20, 0
+    br i1 %21, label %10, label %22, !llvm.loop !6
+
+  22:                                               ; preds = %10
+    ret <4 x float> %17
+  }
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
+  declare <4 x float> @llvm.masked.load.v4f32.p0(ptr captures(none), i32 immarg, <4 x i1>, <4 x float>) #2
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i32 @llvm.smin.i32(i32, i32) #3
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
+
+  attributes #0 = { nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m52" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
+  attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+  attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+  attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #4 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 1, !"min_enum_size", i32 4}
+  !2 = !{!"clang version 22.0.0git"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C/C++ TBAA"}
+  !6 = distinct !{!6, !7, !8}
+  !7 = !{!"llvm.loop.mustprogress"}
+  !8 = !{!"llvm.loop.unroll.disable"}
+...
+---
+name:            test_func
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
----------------
davemgreen wrote:

Equally a lot of these can often be removed to help simplify the test case.

https://github.com/llvm/llvm-project/pull/163941