[llvm] Swap UnrollAndJam Pass to before the SLP Vectorizer Pass (PR #97029)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 06:05:07 PDT 2024
https://github.com/adprasad-nvidia updated https://github.com/llvm/llvm-project/pull/97029
From a845bd89c47c02e82c2f865170edbe9984a143af Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 18 Jun 2024 13:36:01 +0530
Subject: [PATCH 1/7] [UnJ] Move LoopUnrollAndJamPass before SLPVectorizerPass
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 757b20dcd6693..9a04c1013e86c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1317,6 +1317,11 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(BDCEPass());
}
+ // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ }
// Optimize parallel scalar instruction chains into SIMD instructions.
if (PTO.SLPVectorization) {
FPM.addPass(SLPVectorizerPass());
@@ -1335,11 +1340,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- if (EnableUnrollAndJam && PTO.LoopUnrolling) {
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- }
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
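
For reference, the affected region of addVectorPasses() reads roughly as follows after this first patch. This is a condensed, non-verbatim view reassembled from the two hunks above; unrelated passes in between are elided.

  FPM.addPass(BDCEPass());

  // UnrollAndJam now runs in its own loop pass manager ahead of the SLP
  // vectorizer, instead of just before the full LoopUnrollPass further down.
  if (EnableUnrollAndJam && PTO.LoopUnrolling)
    FPM.addPass(createFunctionToLoopPassAdaptor(
        LoopUnrollAndJamPass(Level.getSpeedupLevel())));

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization)
    FPM.addPass(SLPVectorizerPass());

  // ... intervening cleanup passes elided ...

  // Full unrolling keeps its old position, after the SLP vectorizer.
  FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
      PTO.ForgetAllSCEVInLoopUnroll)));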
From 74656b4bc8f51932a0e45da0668ac8844eed4b92 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 25 Jun 2024 15:26:56 +0530
Subject: [PATCH 2/7] [UnJ] Add comments explaining new position of
UnrollAndJam
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 9a04c1013e86c..beb4325829d74 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1245,6 +1245,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
if (EnableUnrollAndJam && PTO.LoopUnrolling)
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
@@ -1318,6 +1319,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
  // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
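
As a standalone illustration of the comment being added here (not code from the PR; the names are made up), consider a loop nest whose outer loop strides across contiguous memory. Unroll-and-jam rewrites the nest so that each inner iteration carries several adjacent scalar operations, which is exactly the pattern the SLP vectorizer packs into vector instructions; that is why UnrollAndJam has to run before the SLPVectorizerPass.

  constexpr int N = 256;
  float A[N][N], B[N][N], C[N][N];

  // Before: each inner-loop body is a single scalar multiply-add, so the SLP
  // vectorizer has nothing to pack.
  void kernel() {
    for (int i = 0; i < N; ++i)     // outer loop: consecutive i are adjacent in memory
      for (int j = 0; j < N; ++j)
        A[j][i] += B[j][i] * C[j][i];
  }

  // After unroll-and-jam of the i-loop by 4: four adjacent scalar operations
  // per inner iteration, which SLP can fuse into <4 x float> loads, FMAs and
  // stores.
  void kernel_jammed() {
    for (int i = 0; i < N; i += 4)
      for (int j = 0; j < N; ++j) {
        A[j][i + 0] += B[j][i + 0] * C[j][i + 0];
        A[j][i + 1] += B[j][i + 1] * C[j][i + 1];
        A[j][i + 2] += B[j][i + 2] * C[j][i + 2];
        A[j][i + 3] += B[j][i + 3] * C[j][i + 3];
      }
  }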
From 7368a69f82c94e0e8077e37fd179e2bd22ba7353 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Tue, 2 Jul 2024 15:03:17 +0530
Subject: [PATCH 3/7] [UnJ] Do not run UnrollAndJam twice if full LTO
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index beb4325829d74..d5943cdc1581d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1320,7 +1320,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
  // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
// In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
From 721e8d7cd8b57161f9c33b2b80f469eaa0fae397 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Wed, 10 Jul 2024 02:31:29 +0530
Subject: [PATCH 4/7] [UnJ] Add test in Transforms/PhaseOrdering for outer loop
vectorization
---
.../PhaseOrdering/outer-loop-vectorize.ll | 174 ++++++++++++++++++
1 file changed, 174 insertions(+)
create mode 100644 llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
diff --git a/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
new file mode 100644
index 0000000000000..b27433d2997fe
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@b = dso_local global [32000 x float] zeroinitializer, align 64
+@c = dso_local global [32000 x float] zeroinitializer, align 64
+@d = dso_local global [32000 x float] zeroinitializer, align 64
+@a = dso_local global [32000 x float] zeroinitializer, align 64
+@e = dso_local global [32000 x float] zeroinitializer, align 64
+@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64
+
+; Function Attrs: nounwind uwtable vscale_range(1,16)
+define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
+; CHECK-LABEL: @s2275(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float undef
+; CHECK: vector.ph:
+; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
+; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
+; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
+; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
+; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
+; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
+; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
+; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: for.cond.cleanup7:
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
+; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
+; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
+ %nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
+ br label %for.cond5.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3
+ ret float undef
+
+for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7
+ %indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
+ br label %for.body8
+
+for.cond.cleanup3: ; preds = %for.cond.cleanup7
+ %call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
+ %inc37 = add nuw nsw i32 %nl.056, 1
+ %exitcond62.not = icmp eq i32 %inc37, 39000
+ br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6
+
+for.cond.cleanup7: ; preds = %for.body8
+ %arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
+ %0 = load float, ptr %arrayidx24, align 4, !tbaa !8
+ %arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
+ %1 = load float, ptr %arrayidx26, align 4, !tbaa !8
+ %arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
+ %2 = load float, ptr %arrayidx28, align 4, !tbaa !8
+ %mul29 = fmul fast float %2, %1
+ %add30 = fadd fast float %mul29, %0
+ %arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
+ store float %add30, ptr %arrayidx32, align 4, !tbaa !8
+ %indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
+ %exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
+ br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12
+
+for.body8: ; preds = %for.cond5.preheader, %for.body8
+ %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %3 = load float, ptr %arrayidx10, align 4, !tbaa !8
+ %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %4 = load float, ptr %arrayidx14, align 4, !tbaa !8
+ %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+ %5 = load float, ptr %arrayidx18, align 4, !tbaa !8
+ %mul = fmul fast float %5, %4
+ %add = fadd fast float %mul, %3
+ store float %add, ptr %arrayidx10, align 4, !tbaa !8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
+}
+
+declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 19.0.0git (git@github.com:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !7, !13}
+!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!14 = distinct !{!14}
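
For readers of the CHECK lines above: the !12/!13 loop metadata requests unroll-and-jam by 4 on the middle i-loop, and the checks then expect the jammed copies of the aa update, plus the b/c/d/a epilogue, to come out of the SLP vectorizer as <4 x float> operations. A plausible C++ source for this kernel, reconstructed from the IR (an assumption, not taken from the PR; the outer benchmark-repetition loop, the @dummy call and the fast-math attributes are omitted), would look like:

  extern float aa[256][256], bb[256][256], cc[256][256];
  extern float a[32000], b[32000], c[32000], d[32000];

  void s2275_like() {
    // Presumably emits the llvm.loop.unroll_and_jam.count 4 metadata (!13 above).
  #pragma clang loop unroll_and_jam_count(4)
    for (int i = 0; i < 256; ++i) {
      for (int j = 0; j < 256; ++j)       // for.body8 in the IR
        aa[j][i] += bb[j][i] * cc[j][i];
      a[i] = b[i] + c[i] * d[i];          // for.cond.cleanup7 in the IR
    }
  }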
From 0f8326e66534dc3a9fcd97c8d3401f8447333517 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Wed, 24 Jul 2024 15:53:17 +0530
Subject: [PATCH 5/7] [UnJ] Run UnJ with !IsFullLTO in same place as UnJ with
IsFullLTO
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 32 ++++++++++++++++--------
1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d5943cdc1581d..bf62747f3734a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1236,6 +1236,27 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
+
+ // Cleanup after loop vectorization. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchRangeToICmp(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
+
+ // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
+ // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+ // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+ if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ }
+
if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
// again. Unroll small loops to hide loop backedge latency and saturate any
@@ -1244,11 +1265,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (EnableUnrollAndJam && PTO.LoopUnrolling)
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
@@ -1318,12 +1334,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(BDCEPass());
}
- // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- }
// Optimize parallel scalar instruction chains into SIMD instructions.
if (PTO.SLPVectorization) {
FPM.addPass(SLPVectorizerPass());
From 5088bacf52f667ed327018e958a01da9bf546d70 Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Thu, 25 Jul 2024 07:18:56 -0700
Subject: [PATCH 6/7] [UnJ] [SimplifyCFG] Only run extra SimplifyCFGPass if UnJ
enabled
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 25 ++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index bf62747f3734a..77743f275b1d3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1237,22 +1237,23 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
- // Cleanup after loop vectorization. Simplification passes like CVP and
- // GVN, loop transforms, and others have already run, so it's now better to
- // convert to more optimized IR using more aggressive simplify CFG options.
- FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
- .forwardSwitchCondToPhi(true)
- .convertSwitchRangeToICmp(true)
- .convertSwitchToLookupTable(true)
- .needCanonicalLoops(false)
- .hoistCommonInsts(true)
- .sinkCommonInsts(true)));
-
// We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
// In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
// Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
- // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+ // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ // Cleanup after loop vectorization. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchRangeToICmp(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
+
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
From b9059f4f343fc011a1d7c96e7550da571cd47b5a Mon Sep 17 00:00:00 2001
From: adprasad <adprasad at nvidia.com>
Date: Fri, 26 Jul 2024 18:32:26 +0530
Subject: [PATCH 7/7] [UnJ] [SimplifyCFG] Fix comment formatting
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 77743f275b1d3..a8f9a1590c150 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1238,14 +1238,17 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(InferAlignmentPass());
// We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
- // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
- // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
- // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
+ // In order for outer loop vectorization to be done, UnrollAndJam must occur
+ // before the SLPVectorizerPass. Placing UnrollAndJam immediately after the
+ // LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+ // placing it immediately before the SLPVectorizerPass, due to analysis
+ // re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
// Cleanup after loop vectorization. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
- // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+ // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP
+ // outer loop vectorization to happen.
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchRangeToICmp(true)
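
Taking the series as a whole, the ordering around the vectorizers in addVectorPasses() ends up roughly as follows. This is a condensed, non-verbatim view reassembled from the hunks above; unrelated passes and the IsFullLTO-only unrolling block are elided.

  if (EnableInferAlignmentPass)
    FPM.addPass(InferAlignmentPass());

  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
    // The extra aggressive SimplifyCFG cleanup now runs only when
    // unroll-and-jam is enabled, so that UnrollAndJam (and, later, SLP) see
    // the loop vectorizer's output in simplified form.
    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                    .forwardSwitchCondToPhi(true)
                                    .convertSwitchRangeToICmp(true)
                                    .convertSwitchToLookupTable(true)
                                    .needCanonicalLoops(false)
                                    .hoistCommonInsts(true)
                                    .sinkCommonInsts(true)));
    // UnrollAndJam now runs once, shortly after the loop vectorizer, for both
    // the LTO and non-LTO pipelines.
    FPM.addPass(createFunctionToLoopPassAdaptor(
        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
  }

  // ... BDCE and other cleanup elided ...

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization)
    FPM.addPass(SLPVectorizerPass());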