[llvm] [Passes] Run IndVars after late partial unrolling. (PR #81275)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 9 08:51:08 PST 2024


https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/81275

After aea2a1473631, partial unrolling now happens much later in the pipeline. At this point, we miss a number of simplification passes to further optimize the partially unrolled loop.

In some cases, this can cause notable performance regressions.

To improve codegen for partially unrolled loop bodies, run IndVars on demand on partially unrolled loops.

This patch uses an approach similar to the way extra cleanup passes are run on demand after vectorization (added in acea6e9cfa4c4a0e8678c7) and if we decide to make wider use of this pattern, we should probably factor out the pass manager/analysis logic before landing this.

Another instance that can be improved by this pattern: https://github.com/llvm/llvm-project/pull/81271

Compile-time impact (geomeans) ranges from +0.11% to +0.19%. https://llvm-compile-time-tracker.com/compare.php?from=68cef34784e33539b75f2c379d8e21a87719d67c&to=446d5e5b2fbfb5efc6cbc40584a31564327df584&stat=instructions:u

This could be reduced by only running a small subset of IndVars to optimize inductions where the impact ranges from +0.05% to +0.10%. https://llvm-compile-time-tracker.com/compare.php?from=68cef34784e33539b75f2c379d8e21a87719d67c&to=bbee5fa003136618446dc30e6ac3897ab03facd7&stat=instructions%3Au

>From ef9c852d563195d0fef93caa596afcf2c6dbb3b8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 8 Feb 2024 13:31:40 +0000
Subject: [PATCH] [Passes] Run IndVars after late partial unrolling.

After aea2a1473631, partial unrolling now happens much later in the
pipeline. At this point, we miss a number of simplification passes to
further optimize the partially unrolled loop.

In some cases, this can cause notable performance regressions.

To improve codegen for partially unrolled loop bodies, run IndVars on
demand on partially unrolled loops.

This patch uses an approach similar to the way extra cleanup passes are
run on demand after vectorization (added in acea6e9cfa4c4a0e8678c7) and
if we decide to make wider use of this pattern, we should probably
factor out the pass manager/analysis logic before landing this.

Another instance that can be improved by this pattern:
https://github.com/llvm/llvm-project/pull/81271

Compile-time impact (geomeans) ranges from +0.11% to +0.19%.
https://llvm-compile-time-tracker.com/compare.php?from=68cef34784e33539b75f2c379d8e21a87719d67c&to=446d5e5b2fbfb5efc6cbc40584a31564327df584&stat=instructions:u

This could be reduced by only running a small subset of IndVars to
optimize inductions where the impact ranges from +0.05% to +0.10%.
https://llvm-compile-time-tracker.com/compare.php?from=68cef34784e33539b75f2c379d8e21a87719d67c&to=bbee5fa003136618446dc30e6ac3897ab03facd7&stat=instructions%3Au
---
 .../llvm/Transforms/Vectorize/LoopVectorize.h | 34 +++++++++++++++++++
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  6 ++++
 llvm/lib/Passes/PassRegistry.def              |  3 ++
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |  9 ++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  2 ++
 .../AArch64/extra-unroll-simplifications.ll   |  6 ++--
 6 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index d09fc328c452ff..e6e9e6065fed84 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -56,8 +56,10 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H
 
+#include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <functional>
 
 namespace llvm {
@@ -111,6 +113,38 @@ struct ExtraVectorPassManager : public FunctionPassManager {
   }
 };
 
+struct ShouldRunExtraUnrollPasses
+    : public AnalysisInfoMixin<ShouldRunExtraUnrollPasses> {
+  static AnalysisKey Key;
+  struct Result {
+    SmallPtrSet<Loop *, 4> Loops;
+    bool invalidate(Function &F, const PreservedAnalyses &PA,
+                    FunctionAnalysisManager::Invalidator &) {
+      // Check whether the analysis has been explicitly invalidated. Otherwise,
+      // it remains preserved.
+      auto PAC = PA.getChecker<ShouldRunExtraUnrollPasses>();
+      return !PAC.preservedWhenStateless();
+    }
+  };
+
+  Result run(Function &F, FunctionAnalysisManager &FAM) { return Result(); }
+};
+
+template <typename MarkerT>
+struct ExtraLoopPassManager : public LoopPassManager {
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+    auto PA = PreservedAnalyses::all();
+    if (auto *X = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR)
+                      .getCachedResult<ShouldRunExtraUnrollPasses>(
+                          *L.getHeader()->getParent()))
+      if (X->Loops.contains(&L))
+        PA.intersect(LoopPassManager::run(L, AM, AR, U));
+    // PA.abandon<MarkerT>();
+    return PA;
+  }
+};
+
 struct LoopVectorizeOptions {
   /// If false, consider all loops for interleaving.
   /// If true, only loops that explicitly request interleaving are considered.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ede8638291206..cb5c88521b2bd7 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1272,6 +1272,12 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
     FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
         Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
         PTO.ForgetAllSCEVInLoopUnroll)));
+    {
+      ExtraLoopPassManager<ShouldRunExtraUnrollPasses> ExtraPasses;
+      ExtraPasses.addPass((IndVarSimplifyPass()));
+      FPM.addPass(createFunctionToLoopPassAdaptor(std::move(ExtraPasses)));
+    }
+
     FPM.addPass(WarnMissedTransformationsPass());
     // Now that we are done with loop unrolling, be it either by LoopVectorizer,
     // or LoopUnroll passes, some variable-offset GEP's into alloca's could have
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6cb87fba426463..42d70f7273565d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -270,6 +270,9 @@ FUNCTION_ANALYSIS("should-not-run-function-passes",
                   ShouldNotRunFunctionPassesAnalysis())
 FUNCTION_ANALYSIS("should-run-extra-vector-passes",
                   ShouldRunExtraVectorPasses())
+FUNCTION_ANALYSIS("should-run-extra-unroll-passes",
+                  ShouldRunExtraUnrollPasses())
+
 FUNCTION_ANALYSIS("ssp-layout", SSPLayoutAnalysis())
 FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
 FUNCTION_ANALYSIS("target-ir",
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 75fb8765061edf..6c01271689c8b4 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -58,6 +58,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -1601,6 +1602,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
   SmallPriorityWorklist<Loop *, 4> Worklist;
   appendLoopsToWorklist(LI, Worklist);
 
+  auto PA = getLoopPassPreservedAnalyses();
   while (!Worklist.empty()) {
     // Because the LoopInfo stores the loops in RPO, we walk the worklist
     // from back to front so that we work forward across the CFG, which
@@ -1629,6 +1631,11 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
         UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
         UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
     Changed |= Result != LoopUnrollResult::Unmodified;
+    if (Result == LoopUnrollResult::PartiallyUnrolled) {
+      auto &E = AM.getResult<ShouldRunExtraUnrollPasses>(F);
+      E.Loops.insert(&L);
+      PA.preserve<ShouldRunExtraUnrollPasses>();
+    }
 
     // The parent must not be damaged by unrolling!
 #ifndef NDEBUG
@@ -1644,7 +1651,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
   if (!Changed)
     return PreservedAnalyses::all();
 
-  return getLoopPassPreservedAnalyses();
+  return PA;
 }
 
 void LoopUnrollPass::printPipeline(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1a7b301c35f2b8..5b3503066242f5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -487,6 +487,8 @@ namespace llvm {
 
 AnalysisKey ShouldRunExtraVectorPasses::Key;
 
+AnalysisKey ShouldRunExtraUnrollPasses::Key;
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6132c35c96ca32..da11888e87ed6d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -20,7 +20,6 @@ define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) {
 ; CHECK-NEXT:    br label [[LOOP_LATCH:%.*]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16
 ; CHECK-NEXT:    [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]]
@@ -32,9 +31,8 @@ define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]]
 ; CHECK-NEXT:    store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
-; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], [[UNROLL_ITER]]
 ; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit.loopexit.unr-lcssa:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]



More information about the llvm-commits mailing list