[llvm] [ARM][MVE] Invalid tail predication in LowOverheadLoop pass (PR #163941)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 19 03:42:50 PDT 2025
================
@@ -0,0 +1,231 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
+
+# The _wrong_ output of this test is to generate the body of the
+# tail-predicated loop like this:
+#
+# $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+# renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+# $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0
+# renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+# $lr = MVE_LETP killed renamable $lr, %bb.1
+#
+# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of
+# the input MQPRCopy, because it won't copy the vector lanes disabled by
+# FPSCR.LTPSIZE, and those are needed in the output value of the loop.
+#
+# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying
+# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE.
+
+--- |
+ ; ModuleID = '162644.c'
+ source_filename = "162644.c"
+ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+ @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16
+
+ ; Function Attrs: nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none)
+ define dso_local <4 x float> @test_func(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
+ %3 = load <4 x float>, ptr @inactive, align 16, !tbaa !3
+ %4 = add i32 %1, 3
+ %5 = call i32 @llvm.smin.i32(i32 %1, i32 4)
+ %6 = sub i32 %4, %5
+ %7 = lshr i32 %6, 2
+ %8 = add nuw nsw i32 %7, 1
+ %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
+ br label %10
+
+ 10: ; preds = %10, %2
+ %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ]
+ %12 = phi i32 [ %1, %2 ], [ %19, %10 ]
+ %13 = phi ptr [ %0, %2 ], [ %18, %10 ]
+ %14 = phi i32 [ %9, %2 ], [ %20, %10 ]
+ %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+ %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer)
+ %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3)
+ %18 = getelementptr inbounds nuw i8, ptr %13, i32 16
+ %19 = add i32 %12, -4
+ %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1)
+ %21 = icmp ne i32 %20, 0
+ br i1 %21, label %10, label %22, !llvm.loop !6
+
+ 22: ; preds = %10
+ ret <4 x float> %17
+ }
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
+ declare <4 x float> @llvm.masked.load.v4f32.p0(ptr captures(none), i32 immarg, <4 x i1>, <4 x float>) #2
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare i32 @llvm.smin.i32(i32, i32) #3
+
+ ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+ declare i32 @llvm.start.loop.iterations.i32(i32) #4
+
+ ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+ declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
+
+ attributes #0 = { nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m52" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+ attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+ attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #4 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.ident = !{!2}
+
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 1, !"min_enum_size", i32 4}
+ !2 = !{!"clang version 22.0.0git"}
+ !3 = !{!4, !4, i64 0}
+ !4 = !{!"omnipotent char", !5, i64 0}
+ !5 = !{!"Simple C/C++ TBAA"}
+ !6 = distinct !{!6, !7, !8}
+ !7 = !{!"llvm.loop.mustprogress"}
+ !8 = !{!"llvm.loop.unroll.disable"}
+...
+---
+name: test_func
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
----------------
davemgreen wrote:
Equally, a lot of these MIR function properties can often be removed to help simplify the test case.
https://github.com/llvm/llvm-project/pull/163941
More information about the llvm-commits mailing list