[llvm] Swap UnrollAndJam Pass to before the SLP Vectorizer Pass (PR #97029)
Yashwant Singh via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 28 23:39:03 PDT 2024
================
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64 ; loaded and stored by the innermost loop of @s2275
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64 ; loaded by the innermost loop of @s2275
+@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64 ; loaded by the innermost loop of @s2275
+@b = dso_local global [32000 x float] zeroinitializer, align 64 ; loaded in for.cond.cleanup7
+@c = dso_local global [32000 x float] zeroinitializer, align 64 ; loaded in for.cond.cleanup7
+@d = dso_local global [32000 x float] zeroinitializer, align 64 ; loaded in for.cond.cleanup7
+@a = dso_local global [32000 x float] zeroinitializer, align 64 ; stored in for.cond.cleanup7
+@e = dso_local global [32000 x float] zeroinitializer, align 64 ; only passed to @dummy
+@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64 ; not referenced by the IR shown in this test
+
+; s2275: three-deep loop nest; the middle loop's latch carries !12 (llvm.loop.unroll_and_jam.count = 4) and the autogenerated assertions below expect <2 x float> and <4 x float> operations in the optimized output. Function Attrs: nounwind uwtable vscale_range(1,16)
+define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
+; CHECK-LABEL: @s2275(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float undef
+; CHECK: vector.ph:
+; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
+; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
+; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
+; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
+; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
+; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
+; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
+; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: for.cond.cleanup7:
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
+; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
+; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 -- outer loop header; %nl.056 starts at 0 and is incremented in for.cond.cleanup3
+ %nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
+ br label %for.cond5.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3 -- function exit; the return value is undef
+ ret float undef
+
+for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7 -- middle loop header; %indvars.iv58 (i) starts at 0
+ %indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
+ br label %for.body8
+
+for.cond.cleanup3: ; preds = %for.cond.cleanup7 -- outer loop latch: calls @dummy, leaves the loop once %inc37 reaches 39000 (loop ID !6)
+ %call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
+ %inc37 = add nuw nsw i32 %nl.056, 1
+ %exitcond62.not = icmp eq i32 %inc37, 39000
+ br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6
+
+for.cond.cleanup7: ; preds = %for.body8 -- middle loop latch: a[i] = d[i] * c[i] + b[i], exits when i + 1 == 256; loop ID !12 carries the unroll-and-jam count
+ %arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
+ %0 = load float, ptr %arrayidx24, align 4, !tbaa !8
+ %arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
+ %1 = load float, ptr %arrayidx26, align 4, !tbaa !8
+ %arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
+ %2 = load float, ptr %arrayidx28, align 4, !tbaa !8
+ %mul29 = fmul fast float %2, %1
+ %add30 = fadd fast float %mul29, %0
+ %arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
+ store float %add30, ptr %arrayidx32, align 4, !tbaa !8
+ %indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
+ %exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
+ br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12
+
+for.body8: ; preds = %for.cond5.preheader, %for.body8 -- innermost loop: aa[j][i] = cc[j][i] * bb[j][i] + aa[j][i] with j = %indvars.iv, exits when j + 1 == 256 (loop ID !14)
+ %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %3 = load float, ptr %arrayidx10, align 4, !tbaa !8
+ %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %4 = load float, ptr %arrayidx14, align 4, !tbaa !8
+ %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %5 = load float, ptr %arrayidx18, align 4, !tbaa !8
+ %mul = fmul fast float %5, %4
+ %add = fadd fast float %mul, %3
+ store float %add, ptr %arrayidx10, align 4, !tbaa !8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
+}
+
+declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 19.0.0git (git@github.com:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
+!6 = distinct !{!6, !7} ; loop ID on the outer loop latch (for.cond.cleanup3)
+!7 = !{!"llvm.loop.mustprogress"}
+!8 = !{!9, !9, i64 0} ; TBAA tag attached to every float load/store in @s2275
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !7, !13} ; loop ID on the middle loop latch (for.cond.cleanup7)
+!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4} ; requested unroll-and-jam count for the middle loop
+!14 = distinct !{!14} ; loop ID on the innermost loop latch (for.body8)
----------------
yashssh wrote:
This test can be simplified by removing the unnecessary attributes and metadata.
https://github.com/llvm/llvm-project/pull/97029
More information about the llvm-commits
mailing list