[llvm] 2fef685 - [llvm][loop-rotate] Allow forcing loop-rotation (#82828)

Thu Feb 29 13:46:17 PST 2024

Author: Paul Kirth
Date: 2024-02-29T13:46:13-08:00
New Revision: 2fef685363e13e0640b624cc3d07b1006f12113c

URL: https://github.com/llvm/llvm-project/commit/2fef685363e13e0640b624cc3d07b1006f12113c
DIFF: https://github.com/llvm/llvm-project/commit/2fef685363e13e0640b624cc3d07b1006f12113c.diff

LOG: [llvm][loop-rotate] Allow forcing loop-rotation (#82828)

Many profitable optimizations cannot be performed at -Oz because loops
are left unrotated: header duplication in loop rotation is disabled at
that level. While duplicating the loop header is (minimally) worse for
size on its own, many of the optimizations it enables significantly
reduce code size, such as memcpy optimizations and other patterns found
by loop idiom recognition. Related discussion can be found in issue
#50308.

This patch adds an experimental, backend-only flag to allow loop header
duplication, regardless of the optimization level. Downstream consumers
can experiment with this flag, and if it is profitable, we can adjust
the compiler's defaults accordingly, and expose any useful frontend
flags to opt into the new behavior.

Added: 
    llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll

Modified: 
    llvm/lib/Passes/PassBuilderPipelines.cpp
    llvm/test/Transforms/LoopRotate/oz-disable.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 991c3ac8f7446c..cbbbec0ccc8c4d 100644

--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -209,6 +209,15 @@ static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
                                        cl::Hidden,
                                        cl::desc("Enable the LoopFlatten Pass"));
 
+// Experimentally allow loop header duplication. This should allow for better
+// optimization at Oz, since loop-idiom recognition can then recognize things
+// like memcpy. If this ends up being useful for many targets, we should drop
+// this flag and make a code generation option that can be controlled
+// independent of the opt level and exposed through the frontend.
+static cl::opt<bool> EnableLoopHeaderDuplication(
+    "enable-loop-header-duplication", cl::init(false), cl::Hidden,
+    cl::desc("Enable loop header duplication at any optimization level"));
+
 static cl::opt<bool>
     EnableDFAJumpThreading("enable-dfa-jump-thread",
                            cl::desc("Enable DFA jump threading"),
@@ -630,8 +639,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
                         /*AllowSpeculation=*/false));
 
   // Disable header duplication in loop rotation at -Oz.
-  LPM1.addPass(
-      LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
+  LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
+                                  Level != OptimizationLevel::Oz,
+                              isLTOPreLink(Phase)));
   // TODO: Investigate promotion cap for O1.
   LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                         /*AllowSpeculation=*/true));
@@ -812,7 +822,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
     // Disable header duplication in loop rotation at -Oz.
     MPM.addPass(createModuleToFunctionPassAdaptor(
         createFunctionToLoopPassAdaptor(
-            LoopRotatePass(Level != OptimizationLevel::Oz),
+            LoopRotatePass(EnableLoopHeaderDuplication ||
+                           Level != OptimizationLevel::Oz),
             /*UseMemorySSA=*/false,
             /*UseBlockFrequencyInfo=*/false),
         PTO.EagerlyInvalidateAnalyses));
@@ -1422,7 +1433,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   LoopPassManager LPM;
   // First rotate loops that may have been un-rotated by prior passes.
   // Disable header duplication at -Oz.
-  LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
+  LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
+                                 Level != OptimizationLevel::Oz,
+                             LTOPreLink));
   // Some loops may have become dead by now. Try to delete them.
   // FIXME: see discussion in https://reviews.llvm.org/D112851,
   //        this may need to be revisited once we run GVN before loop deletion

diff  --git a/llvm/test/Transforms/LoopRotate/oz-disable.ll b/llvm/test/Transforms/LoopRotate/oz-disable.ll
index 6a7847ac0ff215..c45603878ee65c 100644
--- a/llvm/test/Transforms/LoopRotate/oz-disable.ll
+++ b/llvm/test/Transforms/LoopRotate/oz-disable.ll
@@ -4,6 +4,9 @@
 ; RUN: opt < %s -S -passes='default<Os>' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS
 ; RUN: opt < %s -S -passes='default<Oz>' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OZ
 
+;; Make sure -enable-loop-header-duplication overrides the default behavior at Oz
+; RUN: opt < %s -S -passes='default<Oz>' -enable-loop-header-duplication -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS
+
 ; Loop should be rotated for -Os but not for -Oz.
 ; OS: rotating Loop at depth 1
 ; OZ-NOT: rotating Loop at depth 1

diff  --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll
new file mode 100644
index 00000000000000..98b11578b49fbf
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+
+;; Check that -enable-loop-header-duplication at Oz enables certain types of
+;; optimizations, for example replacing the loop body w/ a call to memset. If
+;; loop idiom recognition begins to recognize unrotated loops, this test will
+;; need to be updated.
+
+; RUN: opt -passes='default<Oz>' -S < %s  | FileCheck %s --check-prefix=NOROTATION
+; RUN: opt -passes='default<Oz>' -S  -enable-loop-header-duplication < %s  | FileCheck %s --check-prefix=ROTATION
+; RUN: opt -passes='default<O2>' -S  < %s  | FileCheck %s --check-prefix=ROTATION
+
+define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr {
+; NOROTATION-LABEL: define void @test(
+; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] {
+; NOROTATION-NEXT:  entry:
+; NOROTATION-NEXT:    br label [[LOOP_HEADER:%.*]]
+; NOROTATION:       loop.header:
+; NOROTATION-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; NOROTATION-NEXT:    [[_12_I:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; NOROTATION-NEXT:    br i1 [[_12_I]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; NOROTATION:       loop.latch:
+; NOROTATION-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1
+; NOROTATION-NEXT:    store i8 1, ptr [[PTR_IV]], align 1
+; NOROTATION-NEXT:    br label [[LOOP_HEADER]]
+; NOROTATION:       exit:
+; NOROTATION-NEXT:    ret void
+;
+; ROTATION-LABEL: define void @test(
+; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] {
+; ROTATION-NEXT:  entry:
+; ROTATION-NEXT:    [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]]
+; ROTATION-NEXT:    br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]]
+; ROTATION:       loop.latch.preheader:
+; ROTATION-NEXT:    [[END3:%.*]] = ptrtoint ptr [[END]] to i64
+; ROTATION-NEXT:    [[START4:%.*]] = ptrtoint ptr [[START]] to i64
+; ROTATION-NEXT:    [[TMP0:%.*]] = sub i64 [[END3]], [[START4]]
+; ROTATION-NEXT:    tail call void @llvm.memset.p0.i64(ptr nonnull align 1 [[START]], i8 1, i64 [[TMP0]], i1 false)
+; ROTATION-NEXT:    br label [[EXIT]]
+; ROTATION:       exit:
+; ROTATION-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi i8* [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %_12.i = icmp eq i8* %ptr.iv, %end
+  br i1 %_12.i, label %exit, label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i8, i8* %ptr.iv, i64 1
+  store i8 1, i8* %ptr.iv, align 1
+  br label %loop.header
+
+exit:
+  ret void
+}