[llvm] Swap UnrollAndJam Pass to before the SLP Vectorizer Pass (PR #97029)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 06:05:07 PDT 2024
https://github.com/adprasad-nvidia updated https://github.com/llvm/llvm-project/pull/97029
From a845bd89c47c02e82c2f865170edbe9984a143af Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 18 Jun 2024 13:36:01 +0530
Subject: [PATCH 1/7] [UnJ] Move LoopUnrollAndJamPass before SLPVectorizerPass
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 757b20dcd6693..9a04c1013e86c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1317,6 +1317,11 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(BDCEPass());
}
+ // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ }
// Optimize parallel scalar instruction chains into SIMD instructions.
if (PTO.SLPVectorization) {
FPM.addPass(SLPVectorizerPass());
@@ -1335,11 +1340,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- if (EnableUnrollAndJam && PTO.LoopUnrolling) {
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- }
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
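
For reference, the affected region of addVectorPasses() reads roughly as follows after this first patch. This is a condensed, non-verbatim view reassembled from the two hunks above; unrelated passes in between are elided.

  FPM.addPass(BDCEPass());

  // UnrollAndJam now runs in its own loop pass manager ahead of the SLP
  // vectorizer, instead of just before the full LoopUnrollPass further down.
  if (EnableUnrollAndJam && PTO.LoopUnrolling)
    FPM.addPass(createFunctionToLoopPassAdaptor(
        LoopUnrollAndJamPass(Level.getSpeedupLevel())));

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization)
    FPM.addPass(SLPVectorizerPass());

  // ... intervening cleanup passes elided ...

  // Full unrolling keeps its old position, after the SLP vectorizer.
  FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
      PTO.ForgetAllSCEVInLoopUnroll)));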
From 74656b4bc8f51932a0e45da0668ac8844eed4b92 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 25 Jun 2024 15:26:56 +0530
Subject: [PATCH 2/7] [UnJ] Add comments explaining new position of
UnrollAndJam
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 9a04c1013e86c..beb4325829d74 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1245,6 +1245,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
if (EnableUnrollAndJam && PTO.LoopUnrolling)
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
@@ -1318,6 +1319,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
  // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
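
As a standalone illustration of the comment being added here (not code from the PR; the names are made up), consider a loop nest whose outer loop strides across contiguous memory. Unroll-and-jam rewrites the nest so that each inner iteration carries several adjacent scalar operations, which is exactly the pattern the SLP vectorizer packs into vector instructions; that is why UnrollAndJam has to run before the SLPVectorizerPass.

  constexpr int N = 256;
  float A[N][N], B[N][N], C[N][N];

  // Before: each inner-loop body is a single scalar multiply-add, so the SLP
  // vectorizer has nothing to pack.
  void kernel() {
    for (int i = 0; i < N; ++i)     // outer loop: consecutive i are adjacent in memory
      for (int j = 0; j < N; ++j)
        A[j][i] += B[j][i] * C[j][i];
  }

  // After unroll-and-jam of the i-loop by 4: four adjacent scalar operations
  // per inner iteration, which SLP can fuse into <4 x float> loads, FMAs and
  // stores.
  void kernel_jammed() {
    for (int i = 0; i < N; i += 4)
      for (int j = 0; j < N; ++j) {
        A[j][i + 0] += B[j][i + 0] * C[j][i + 0];
        A[j][i + 1] += B[j][i + 1] * C[j][i + 1];
        A[j][i + 2] += B[j][i + 2] * C[j][i + 2];
        A[j][i + 3] += B[j][i + 3] * C[j][i + 3];
      }
  }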
From 7368a69f82c94e0e8077e37fd179e2bd22ba7353 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 2 Jul 2024 15:03:17 +0530
Subject: [PATCH 3/7] [UnJ] Do not run UnrollAndJam twice if full LTO
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index beb4325829d74..d5943cdc1581d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1320,7 +1320,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
  // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
// In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
From 721e8d7cd8b57161f9c33b2b80f469eaa0fae397 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Wed, 10 Jul 2024 02:31:29 +0530
Subject: [PATCH 4/7] [UnJ] Add test in Transforms/PhaseOrdering for outer loop
vectorization
---
.../PhaseOrdering/outer-loop-vectorize.ll | 174 ++++++++++++++++++
1 file changed, 174 insertions(+)
create mode 100644 llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
diff --git a/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
new file mode 100644
index 0000000000000..b27433d2997fe
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@b = dso_local global [32000 x float] zeroinitializer, align 64
+@c = dso_local global [32000 x float] zeroinitializer, align 64
+@d = dso_local global [32000 x float] zeroinitializer, align 64
+@a = dso_local global [32000 x float] zeroinitializer, align 64
+@e = dso_local global [32000 x float] zeroinitializer, align 64
+@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64
+
+; Function Attrs: nounwind uwtable vscale_range(1,16)
+define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
+; CHECK-LABEL: @s2275(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float undef
+; CHECK: vector.ph:
+; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
+; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
+; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
+; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
+; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
+; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
+; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
+; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: for.cond.cleanup7:
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
+; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
+; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
+ %nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
+ br label %for.cond5.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3
+ ret float undef
+
+for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7
+ %indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
+ br label %for.body8
+
+for.cond.cleanup3: ; preds = %for.cond.cleanup7
+ %call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
+ %inc37 = add nuw nsw i32 %nl.056, 1
+ %exitcond62.not = icmp eq i32 %inc37, 39000
+ br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6
+
+for.cond.cleanup7: ; preds = %for.body8
+ %arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
+ %0 = load float, ptr %arrayidx24, align 4, !tbaa !8
+ %arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
+ %1 = load float, ptr %arrayidx26, align 4, !tbaa !8
+ %arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
+ %2 = load float, ptr %arrayidx28, align 4, !tbaa !8
+ %mul29 = fmul fast float %2, %1
+ %add30 = fadd fast float %mul29, %0
+ %arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
+ store float %add30, ptr %arrayidx32, align 4, !tbaa !8
+ %indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
+ %exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
+ br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12
+
+for.body8: ; preds = %for.cond5.preheader, %for.body8
+ %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %3 = load float, ptr %arrayidx10, align 4, !tbaa !8
+ %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %4 = load float, ptr %arrayidx14, align 4, !tbaa !8
+ %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %5 = load float, ptr %arrayidx18, align 4, !tbaa !8
+ %mul = fmul fast float %5, %4
+ %add = fadd fast float %mul, %3
+ store float %add, ptr %arrayidx10, align 4, !tbaa !8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
+}
+
+declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 19.0.0git (git@github.com:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !7, !13}
+!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!14 = distinct !{!14}
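
For readers of the CHECK lines above: the !12/!13 loop metadata requests unroll-and-jam by 4 on the middle i-loop, and the checks then expect the jammed copies of the aa update, plus the b/c/d/a epilogue, to come out of the SLP vectorizer as <4 x float> operations. A plausible C++ source for this kernel, reconstructed from the IR (an assumption, not taken from the PR; the outer benchmark-repetition loop, the @dummy call and the fast-math attributes are omitted), would look like:

  extern float aa[256][256], bb[256][256], cc[256][256];
  extern float a[32000], b[32000], c[32000], d[32000];

  void s2275_like() {
    // Presumably emits the llvm.loop.unroll_and_jam.count 4 metadata (!13 above).
  #pragma clang loop unroll_and_jam_count(4)
    for (int i = 0; i < 256; ++i) {
      for (int j = 0; j < 256; ++j)       // for.body8 in the IR
        aa[j][i] += bb[j][i] * cc[j][i];
      a[i] = b[i] + c[i] * d[i];          // for.cond.cleanup7 in the IR
    }
  }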
From 0f8326e66534dc3a9fcd97c8d3401f8447333517 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Wed, 24 Jul 2024 15:53:17 +0530
Subject: [PATCH 5/7] [UnJ] Run UnJ with !IsFullLTO in same place as UnJ with
IsFullLTO
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 32 ++++++++++++++++--------
1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d5943cdc1581d..bf62747f3734a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1236,6 +1236,27 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
+
+ // Cleanup after loop vectorization. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchRangeToICmp(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
+
+ // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
+ // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+ // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+ if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ }
+
if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
// again. Unroll small loops to hide loop backedge latency and saturate any
@@ -1244,11 +1265,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (EnableUnrollAndJam && PTO.LoopUnrolling)
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
@@ -1318,12 +1334,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(BDCEPass());
}
- // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- }
// Optimize parallel scalar instruction chains into SIMD instructions.
if (PTO.SLPVectorization) {
FPM.addPass(SLPVectorizerPass());
From 5088bacf52f667ed327018e958a01da9bf546d70 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Thu, 25 Jul 2024 07:18:56 -0700
Subject: [PATCH 6/7] [UnJ] [SimplifyCFG] Only run extra SimplifyCFGPass if UnJ
enabled
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 25 ++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index bf62747f3734a..77743f275b1d3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1237,22 +1237,23 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
- // Cleanup after loop vectorization. Simplification passes like CVP and
- // GVN, loop transforms, and others have already run, so it's now better to
- // convert to more optimized IR using more aggressive simplify CFG options.
- FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
- .forwardSwitchCondToPhi(true)
- .convertSwitchRangeToICmp(true)
- .convertSwitchToLookupTable(true)
- .needCanonicalLoops(false)
- .hoistCommonInsts(true)
- .sinkCommonInsts(true)));
-
// We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
// In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
// Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
- // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+ // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ // Cleanup after loop vectorization. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchRangeToICmp(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
+
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
From b9059f4f343fc011a1d7c96e7550da571cd47b5a Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Fri, 26 Jul 2024 18:32:26 +0530
Subject: [PATCH 7/7] [UnJ] [SimplifyCFG] Fix comment formatting
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 77743f275b1d3..a8f9a1590c150 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1238,14 +1238,17 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(InferAlignmentPass());
// We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
- // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur
+ // before the SLPVectorizerPass. Placing UnrollAndJam immediately after the
+ // LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+ // placing it immediately before the SLPVectorizerPass, due to analysis
+ // re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
// Cleanup after loop vectorization. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
- // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+ // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP
+ // outer loop vectorization to happen.
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchRangeToICmp(true)
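
Taking the series as a whole, the ordering around the vectorizers in addVectorPasses() ends up roughly as follows. This is a condensed, non-verbatim view reassembled from the hunks above; unrelated passes and the IsFullLTO-only unrolling block are elided.

  if (EnableInferAlignmentPass)
    FPM.addPass(InferAlignmentPass());

  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
    // The extra aggressive SimplifyCFG cleanup now runs only when
    // unroll-and-jam is enabled, so that UnrollAndJam (and, later, SLP) see
    // the loop vectorizer's output in simplified form.
    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                    .forwardSwitchCondToPhi(true)
                                    .convertSwitchRangeToICmp(true)
                                    .convertSwitchToLookupTable(true)
                                    .needCanonicalLoops(false)
                                    .hoistCommonInsts(true)
                                    .sinkCommonInsts(true)));
    // UnrollAndJam now runs once, shortly after the loop vectorizer, for both
    // the LTO and non-LTO pipelines.
    FPM.addPass(createFunctionToLoopPassAdaptor(
        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
  }

  // ... BDCE and other cleanup elided ...

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization)
    FPM.addPass(SLPVectorizerPass());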