[PATCH] D128830: [Pipelines] Introduce DAE after ArgumentPromotion

Wed Jun 29 08:25:29 PDT 2022

psamolysov created this revision.
psamolysov added reviewers: aeubanks, mtrofin, fhahn, yln, serge-sans-paille, nikic.
psamolysov added a project: LLVM.
Herald added a subscriber: hiraditya.
Herald added a project: All.
psamolysov requested review of this revision.
Herald added a reviewer: jdoerfert.
Herald added subscribers: llvm-commits, sstefan1.

The ArgumentPromotion pass runs Mem2Reg promotion at the end to cut
down on the generated `alloca` instructions as well as meaningless `store`s,
and this can leave unused (dead) arguments behind. To eliminate the dead
arguments, and thereby let DeadCodeElimination remove the inserted `GEP`s,
`load`s and `cast`s in the callers that become dead as a result, the
DeadArgumentElimination pass should be run after the ArgumentPromotion
pass.

For example, the following code

  %struct.ss = type { i32, i64 }
  
  @dummy = global i32 0
  
  define internal void @f(%struct.ss* byval(%struct.ss) align 8 %b, i32* byval(i32) align 4 %X) noinline nounwind  {
  entry:
    %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
    %temp1 = load i32, i32* %temp, align 4
    %temp2 = add i32 %temp1, 1
    store i32 %temp2, i32* @dummy
    store i32 %temp2, i32* %X
    ret void
  }
  
  define i32 @test(i32* %X) {
  entry:
    %S = alloca %struct.ss, align 8
    %temp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0
    store i32 1, i32* %temp1, align 8
    %temp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1
    store i64 2, i64* %temp4, align 4
    call void @f( %struct.ss* byval(%struct.ss) align 8 %S, i32* byval(i32) align 4 %X)
    ret i32 0
  }

was optimized, without DAE, into:

  @dummy = local_unnamed_addr global i32 0
  
  define internal fastcc void @f(i32 %b.0.val, i32 %X.0.val) unnamed_addr {
  ; %X.0.val is a dead argument
  entry:
    %temp2 = add i32 %b.0.val, 1
    store i32 %temp2, i32* @dummy, align 4
    ret void
  }
  
  define i32 @test(i32* nocapture readonly %X) local_unnamed_addr {
  entry:
    %X.val = load i32, i32* %X, align 4 ; this load is not actually required
    tail call fastcc void @f(i32 1, i32 %X.val) ; passing %X.val is pointless
    ret i32 0
  }

and after applying this patch, with DAE, into:

  @dummy = local_unnamed_addr global i32 0
  
  define internal fastcc void @f(i32 %b.0.val) unnamed_addr {
  entry:
    %temp2 = add i32 %b.0.val, 1
    store i32 %temp2, i32* @dummy, align 4
    ret void
  }
  
  define i32 @test(i32* nocapture readonly %X) local_unnamed_addr {
  entry:
    tail call fastcc void @f(i32 1) ; neither %X.val nor the load instruction remains
    ret i32 0
  }


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128830

Files:
  llvm/lib/Passes/PassBuilderPipelines.cpp


Index: llvm/lib/Passes/PassBuilderPipelines.cpp
===================================================================

--- llvm/lib/Passes/PassBuilderPipelines.cpp
+++ llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -635,7 +635,7 @@
 
     FunctionPassManager FPM;
     FPM.addPass(SROAPass());
-    FPM.addPass(EarlyCSEPass());    // Catch trivial redundancies.
+    FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
     FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(
         true)));                    // Merge & remove basic blocks.
     FPM.addPass(InstCombinePass()); // Combine silly sequences.
@@ -728,10 +728,9 @@
   if (PGOOpt)
     IP.EnableDeferral = EnablePGOInlineDeferral;
 
-  ModuleInlinerWrapperPass MIWP(
-      IP, PerformMandatoryInliningsFirst,
-      InlineContext{Phase, InlinePass::CGSCCInliner},
-      UseInlineAdvisor, MaxDevirtIterations);
+  ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst,
+                                InlineContext{Phase, InlinePass::CGSCCInliner},
+                                UseInlineAdvisor, MaxDevirtIterations);
 
   // Require the GlobalsAA analysis for the module so we can query it within
   // the CGSCC pipeline.
@@ -767,6 +766,14 @@
   if (Level == OptimizationLevel::O3)
     MainCGPipeline.addPass(ArgumentPromotionPass());
 
+  // The ArgumentPromotion pass runs Mem2Reg promotion at the end and this can
+  // make some arguments unused in the callee. Any unused arguments must be
+  // removed.
+  // FIXME: If ArgumentPromotionPass weren't limited to O3, this shouldn't be
+  // either.
+  if (Level == OptimizationLevel::O3)
+    MIWP.addLateModulePass(DeadArgumentEliminationPass());
+
   // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
   // there are no OpenMP runtime calls present in the module.
   if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
@@ -1612,7 +1619,7 @@
       getInlineParamsFromOptLevel(Level),
       /* MandatoryFirst */ true,
       InlineContext{ThinOrFullLTOPhase::FullLTOPostLink,
-                          InlinePass::CGSCCInliner}));
+                    InlinePass::CGSCCInliner}));
 
   // Optimize globals again after we ran the inliner.
   MPM.addPass(GlobalOptPass());
@@ -1624,6 +1631,11 @@
   // transform it to pass arguments by value instead of by reference.
   MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
 
+  // The ArgumentPromotion pass runs Mem2Reg promotion at the end and this
+  // can make some arguments unused in the callee. Any unused arguments must be
+  // removed.
+  MPM.addPass(DeadArgumentEliminationPass());
+
   FunctionPassManager FPM;
   // The IPO Passes may leave cruft around. Clean up after them.
   FPM.addPass(InstCombinePass());
@@ -1686,7 +1698,6 @@
   MainFPM.addPass(DSEPass());
   MainFPM.addPass(MergedLoadStoreMotionPass());
 
-
   if (EnableConstraintElimination)
     MainFPM.addPass(ConstraintEliminationPass());
 
@@ -1711,8 +1722,7 @@
   addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
 
   // Run the OpenMPOpt CGSCC pass again late.
-  MPM.addPass(
-      createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass()));
+  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass()));
 
   invokePeepholeEPCallbacks(MainFPM, Level);
   MainFPM.addPass(JumpThreadingPass());


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D128830.441025.patch
Type: text/x-patch
Size: 3388 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220629/a526c4a4/attachment.bin>