[llvm] [CodeGen][NPM] Support CodeGenSCCOrder in pipeline (PR #136818)

Akshat Oke via llvm-commits llvm-commits at lists.llvm.org
Mon May 12 02:11:57 PDT 2025


https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/136818

>From dd33574eff03eb3c27a292558b2d383b39322b40 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 23 Apr 2025 06:38:10 +0000
Subject: [PATCH 1/6] [CodeGen][NPM] Support CodeGenSCCOrder in pipeline
 pb/codegenscc-order

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  89 +++++++++--
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  | 144 ++++++++++++++++++
 3 files changed, 219 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 6ed9ac47405d3..67f4a36511c5b 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -210,10 +211,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
   class AddIRPass {
   public:
     AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {}
-    ~AddIRPass() {
-      if (!FPM.isEmpty())
-        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-    }
+    ~AddIRPass() { flushFPMToMPM(); }
 
     template <typename PassT>
     void operator()(PassT &&Pass, StringRef Name = PassT::name()) {
@@ -231,16 +229,40 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
         FPM.addPass(std::forward<PassT>(Pass));
       } else {
         // Add Module Pass
-        if (!FPM.isEmpty()) {
-          MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-          FPM = FunctionPassManager();
-        }
-
+        flushFPMToMPM();
         MPM.addPass(std::forward<PassT>(Pass));
       }
     }
 
+    /// Flush pending function passes, then wrap later ones in a CGSCC adaptor.
+    void requireCGSCCOrder() {
+      if (PB.AddInCGSCCOrder)
+        return;
+      flushFPMToMPM();
+      PB.AddInCGSCCOrder = true;
+    }
+
+    /// Stop adding passes to the CGSCC pass manager.
+    /// Existing passes won't be removed.
+    void stopAddingInCGSCCOrder() {
+      if (!PB.AddInCGSCCOrder)
+        return;
+      flushFPMToMPM();
+      PB.AddInCGSCCOrder = false;
+    }
+
   private:
+    void flushFPMToMPM() {
+      if (!FPM.isEmpty()) {
+        if (PB.AddInCGSCCOrder) {
+          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+              createCGSCCToFunctionPassAdaptor(std::move(FPM))));
+        } else {
+          MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        }
+        FPM = FunctionPassManager();
+      }
+    }
     ModulePassManager &MPM;
     FunctionPassManager FPM;
     const DerivedT &PB;
@@ -257,7 +279,11 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
         FPM.addPass(
             createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
         FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
-        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        if (this->PB.AddInCGSCCOrder) {
+          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+              createCGSCCToFunctionPassAdaptor(std::move(FPM))));
+        } else
+          MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       }
     }
 
@@ -276,12 +302,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
         MFPM.addPass(std::forward<PassT>(Pass));
       } else {
         // Add Module Pass
-        if (!MFPM.isEmpty()) {
-          MPM.addPass(createModuleToFunctionPassAdaptor(
-              createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
-          MFPM = MachineFunctionPassManager();
-        }
-
+        flushMFPMToMPM();
         MPM.addPass(std::forward<PassT>(Pass));
       }
 
@@ -289,7 +310,39 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
         C(Name, MFPM);
     }
 
+    /// Flush pending machine passes, then wrap later ones in a CGSCC adaptor.
+    void requireCGSCCOrder() {
+      if (PB.AddInCGSCCOrder)
+        return;
+      flushMFPMToMPM();
+      PB.AddInCGSCCOrder = true;
+    }
+
+    /// Stop adding passes to the CGSCC pass manager.
+    /// Existing passes won't be removed.
+    void stopAddingInCGSCCOrder() {
+      if (!PB.AddInCGSCCOrder)
+        return;
+      flushMFPMToMPM();
+      PB.AddInCGSCCOrder = false;
+    }
+
   private:
+    void flushMFPMToMPM() {
+      if (!MFPM.isEmpty()) {
+        if (PB.AddInCGSCCOrder) {
+          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+              createCGSCCToFunctionPassAdaptor(
+                  createFunctionToMachineFunctionPassAdaptor(
+                      std::move(MFPM)))));
+        } else {
+          MPM.addPass(createModuleToFunctionPassAdaptor(
+              createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
+        }
+        MFPM = MachineFunctionPassManager();
+      }
+    }
+
     ModulePassManager &MPM;
     MachineFunctionPassManager MFPM;
     const DerivedT &PB;
@@ -555,6 +608,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
   /// Helper variable for `-start-before/-start-after/-stop-before/-stop-after`
   mutable bool Started = true;
   mutable bool Stopped = true;
+  mutable bool AddInCGSCCOrder = false;
 };
 
 template <typename Derived, typename TargetMachineT>
@@ -813,6 +867,9 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addISelPrepare(
     AddIRPass &addPass) const {
   derived().addPreISel(addPass);
 
+  if (Opt.RequiresCodeGenSCCOrder)
+    addPass.requireCGSCCOrder();
+
   addPass(CallBrPreparePass());
   // Add both the safe stack and the stack protection passes: each of them will
   // only protect functions that have corresponding attributes.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ccb251b730f16..680a3fb78a6e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2079,6 +2079,8 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
   // being run on them, which causes crashes in the resource usage analysis).
   addPass(AMDGPULowerBufferFatPointersPass(TM));
 
+  addPass.requireCGSCCOrder();
+
   Base::addCodeGenPrepare(addPass);
 
   if (isPassEnabled(EnableLoadStoreVectorizer))
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
new file mode 100644
index 0000000000000..96a533a19c88a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -0,0 +1,144 @@
+; UNSUPPORTED: expensive_checks
+; RUN: llc -O3 -enable-new-pm -mtriple=amdgcn--amdhsa -disable-verify -print-pipeline-passes < %s 2>&1 \
+; RUN:   | tr ',' '\n' | FileCheck -check-prefix=GCN-O3 %s
+
+; REQUIRES: asserts
+
+; GCN-O3: require<MachineModuleAnalysis>
+; GCN-O3-NEXT: require<profile-summary>
+; GCN-O3-NEXT: require<collector-metadata>
+; GCN-O3-NEXT: pre-isel-intrinsic-lowering
+; GCN-O3-NEXT: function(expand-large-div-rem
+; GCN-O3-NEXT: expand-fp)
+; GCN-O3-NEXT: amdgpu-remove-incompatible-functions
+; GCN-O3-NEXT: amdgpu-printf-runtime-binding
+; GCN-O3-NEXT: amdgpu-lower-ctor-dtor
+; GCN-O3-NEXT: function(amdgpu-image-intrinsic-opt)
+; GCN-O3-NEXT: expand-variadics
+; GCN-O3-NEXT: amdgpu-always-inline
+; GCN-O3-NEXT: always-inline
+; GCN-O3-NEXT: amdgpu-export-kernel-runtime-handles
+; GCN-O3-NEXT: amdgpu-sw-lower-lds
+; GCN-O3-NEXT: amdgpu-lower-module-lds
+; GCN-O3-NEXT: function(infer-address-spaces
+; GCN-O3-NEXT: amdgpu-atomic-optimizer
+; GCN-O3-NEXT: atomic-expand
+; GCN-O3-NEXT: amdgpu-promote-alloca
+; GCN-O3-NEXT: separate-const-offset-from-gep<>
+; GCN-O3-NEXT: slsr
+; GCN-O3-NEXT: gvn<>
+; GCN-O3-NEXT: nary-reassociate
+; GCN-O3-NEXT: early-cse<>
+; GCN-O3-NEXT: amdgpu-codegenprepare
+; GCN-O3-NEXT: loop-mssa(loop-reduce)
+; GCN-O3-NEXT: mergeicmps
+; GCN-O3-NEXT: expand-memcmp
+; GCN-O3-NEXT: gc-lowering
+; GCN-O3-NEXT: lower-constant-intrinsics
+; GCN-O3-NEXT: UnreachableBlockElimPass
+; GCN-O3-NEXT: consthoist
+; GCN-O3-NEXT: ReplaceWithVeclib
+; GCN-O3-NEXT: partially-inline-libcalls
+; GCN-O3-NEXT: ee-instrument<post-inline>
+; GCN-O3-NEXT: scalarize-masked-mem-intrin
+; GCN-O3-NEXT: ExpandReductionsPass
+; GCN-O3-NEXT: gvn<>
+; GCN-O3-NEXT: amdgpu-lower-kernel-arguments)
+; GCN-O3-NEXT: amdgpu-lower-buffer-fat-pointers
+; GCN-O3-NEXT: cgscc(function(codegenprepare
+; GCN-O3-NEXT: load-store-vectorizer
+; GCN-O3-NEXT: lower-switch
+; GCN-O3-NEXT: lower-invoke
+; GCN-O3-NEXT: UnreachableBlockElimPass
+; GCN-O3-NEXT: flatten-cfg
+; GCN-O3-NEXT: sink
+; GCN-O3-NEXT: amdgpu-late-codegenprepare
+; GCN-O3-NEXT: amdgpu-unify-divergent-exit-nodes
+; GCN-O3-NEXT: fix-irreducible
+; GCN-O3-NEXT: unify-loop-exits
+; GCN-O3-NEXT: StructurizeCFGPass
+; GCN-O3-NEXT: amdgpu-annotate-uniform
+; GCN-O3-NEXT: si-annotate-control-flow
+; GCN-O3-NEXT: amdgpu-rewrite-undef-for-phi
+; GCN-O3-NEXT: lcssa))
+; GCN-O3-NEXT: amdgpu-perf-hint
+; GCN-O3-NEXT: cgscc(function(require<uniformity>
+; GCN-O3-NEXT: callbr-prepare
+; GCN-O3-NEXT: safe-stack
+; GCN-O3-NEXT: stack-protector))
+; GCN-O3-NEXT: cgscc(function(machine-function(amdgpu-isel
+; GCN-O3-NEXT: si-fix-sgpr-copies
+; GCN-O3-NEXT: si-i1-copies
+; GCN-O3-NEXT: finalize-isel
+; GCN-O3-NEXT: early-tailduplication
+; GCN-O3-NEXT: opt-phis
+; GCN-O3-NEXT: stack-coloring
+; GCN-O3-NEXT: localstackalloc
+; GCN-O3-NEXT: dead-mi-elimination
+; GCN-O3-NEXT: early-machinelicm
+; GCN-O3-NEXT: machine-cse
+; GCN-O3-NEXT: machine-sink
+; GCN-O3-NEXT: peephole-opt
+; GCN-O3-NEXT: dead-mi-elimination
+; GCN-O3-NEXT: si-fold-operands
+; GCN-O3-NEXT: gcn-dpp-combine
+; GCN-O3-NEXT: si-load-store-opt
+; GCN-O3-NEXT: si-peephole-sdwa
+; GCN-O3-NEXT: early-machinelicm
+; GCN-O3-NEXT: machine-cse
+; GCN-O3-NEXT: si-fold-operands
+; GCN-O3-NEXT: dead-mi-elimination
+; GCN-O3-NEXT: si-shrink-instructions
+; GCN-O3-NEXT: detect-dead-lanes
+; GCN-O3-NEXT: InitUndefPass
+; GCN-O3-NEXT: ProcessImplicitDefsPass
+; GCN-O3-NEXT: unreachable-mbb-elimination
+; GCN-O3-NEXT: require<live-vars>
+; GCN-O3-NEXT: require<machine-loops>
+; GCN-O3-NEXT: phi-node-elimination
+; GCN-O3-NEXT: two-address-instruction
+; GCN-O3-NEXT: register-coalescer
+; GCN-O3-NEXT: rename-independent-subregs
+; GCN-O3-NEXT: machine-scheduler
+; GCN-O3-NEXT: greedy<all>
+; GCN-O3-NEXT: amdgpu-nsa-reassign
+; GCN-O3-NEXT: VirtRegRewriterPass
+; GCN-O3-NEXT: stack-slot-coloring
+; GCN-O3-NEXT: machine-cp
+; GCN-O3-NEXT: machinelicm
+; GCN-O3-NEXT: si-fix-vgpr-copies
+; GCN-O3-NEXT: si-optimize-exec-masking
+; GCN-O3-NEXT: remove-redundant-debug-values
+; GCN-O3-NEXT: fixup-statepoint-caller-saved
+; GCN-O3-NEXT: PostRAMachineSinkingPass
+; GCN-O3-NEXT: ShrinkWrapPass
+; GCN-O3-NEXT: PrologEpilogInserterPass
+; GCN-O3-NEXT: branch-folder
+; GCN-O3-NEXT: tailduplication
+; GCN-O3-NEXT: machine-latecleanup
+; GCN-O3-NEXT: machine-cp
+; GCN-O3-NEXT: post-ra-pseudos
+; GCN-O3-NEXT: postmisched
+; GCN-O3-NEXT: block-placement
+; GCN-O3-NEXT: fentry-insert
+; GCN-O3-NEXT: xray-instrumentation
+; GCN-O3-NEXT: patchable-function
+; GCN-O3-NEXT: gcn-create-vopd
+; GCN-O3-NEXT: si-memory-legalizer
+; GCN-O3-NEXT: si-insert-waitcnts
+; GCN-O3-NEXT: si-late-branch-lowering
+; GCN-O3-NEXT: si-pre-emit-peephole
+; GCN-O3-NEXT: post-RA-hazard-rec
+; GCN-O3-NEXT: AMDGPUWaitSGPRHazardsPass
+; GCN-O3-NEXT: amdgpu-insert-delay-alu
+; GCN-O3-NEXT: branch-relaxation
+; GCN-O3-NEXT: remove-loads-into-fake-uses
+; GCN-O3-NEXT: live-debug-values
+; GCN-O3-NEXT: machine-sanmd
+; GCN-O3-NEXT: stack-frame-layout)
+; GCN-O3-NEXT: invalidate<machine-function-info>))
+
+
+define void @empty() {
+  ret void
+}

>From 435d1c64d8712483d40cbf8a9707ba257e3abd71 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 23 Apr 2025 08:55:23 +0000
Subject: [PATCH 2/6] Remove tr and support expensive check

---
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 267 +++++++++----------
 1 file changed, 133 insertions(+), 134 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 96a533a19c88a..7ba1771eba08d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -1,142 +1,141 @@
-; UNSUPPORTED: expensive_checks
 ; RUN: llc -O3 -enable-new-pm -mtriple=amdgcn--amdhsa -disable-verify -print-pipeline-passes < %s 2>&1 \
-; RUN:   | tr ',' '\n' | FileCheck -check-prefix=GCN-O3 %s
+; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
 ; REQUIRES: asserts
 
 ; GCN-O3: require<MachineModuleAnalysis>
-; GCN-O3-NEXT: require<profile-summary>
-; GCN-O3-NEXT: require<collector-metadata>
-; GCN-O3-NEXT: pre-isel-intrinsic-lowering
-; GCN-O3-NEXT: function(expand-large-div-rem
-; GCN-O3-NEXT: expand-fp)
-; GCN-O3-NEXT: amdgpu-remove-incompatible-functions
-; GCN-O3-NEXT: amdgpu-printf-runtime-binding
-; GCN-O3-NEXT: amdgpu-lower-ctor-dtor
-; GCN-O3-NEXT: function(amdgpu-image-intrinsic-opt)
-; GCN-O3-NEXT: expand-variadics
-; GCN-O3-NEXT: amdgpu-always-inline
-; GCN-O3-NEXT: always-inline
-; GCN-O3-NEXT: amdgpu-export-kernel-runtime-handles
-; GCN-O3-NEXT: amdgpu-sw-lower-lds
-; GCN-O3-NEXT: amdgpu-lower-module-lds
-; GCN-O3-NEXT: function(infer-address-spaces
-; GCN-O3-NEXT: amdgpu-atomic-optimizer
-; GCN-O3-NEXT: atomic-expand
-; GCN-O3-NEXT: amdgpu-promote-alloca
-; GCN-O3-NEXT: separate-const-offset-from-gep<>
-; GCN-O3-NEXT: slsr
-; GCN-O3-NEXT: gvn<>
-; GCN-O3-NEXT: nary-reassociate
-; GCN-O3-NEXT: early-cse<>
-; GCN-O3-NEXT: amdgpu-codegenprepare
-; GCN-O3-NEXT: loop-mssa(loop-reduce)
-; GCN-O3-NEXT: mergeicmps
-; GCN-O3-NEXT: expand-memcmp
-; GCN-O3-NEXT: gc-lowering
-; GCN-O3-NEXT: lower-constant-intrinsics
-; GCN-O3-NEXT: UnreachableBlockElimPass
-; GCN-O3-NEXT: consthoist
-; GCN-O3-NEXT: ReplaceWithVeclib
-; GCN-O3-NEXT: partially-inline-libcalls
-; GCN-O3-NEXT: ee-instrument<post-inline>
-; GCN-O3-NEXT: scalarize-masked-mem-intrin
-; GCN-O3-NEXT: ExpandReductionsPass
-; GCN-O3-NEXT: gvn<>
-; GCN-O3-NEXT: amdgpu-lower-kernel-arguments)
-; GCN-O3-NEXT: amdgpu-lower-buffer-fat-pointers
-; GCN-O3-NEXT: cgscc(function(codegenprepare
-; GCN-O3-NEXT: load-store-vectorizer
-; GCN-O3-NEXT: lower-switch
-; GCN-O3-NEXT: lower-invoke
-; GCN-O3-NEXT: UnreachableBlockElimPass
-; GCN-O3-NEXT: flatten-cfg
-; GCN-O3-NEXT: sink
-; GCN-O3-NEXT: amdgpu-late-codegenprepare
-; GCN-O3-NEXT: amdgpu-unify-divergent-exit-nodes
-; GCN-O3-NEXT: fix-irreducible
-; GCN-O3-NEXT: unify-loop-exits
-; GCN-O3-NEXT: StructurizeCFGPass
-; GCN-O3-NEXT: amdgpu-annotate-uniform
-; GCN-O3-NEXT: si-annotate-control-flow
-; GCN-O3-NEXT: amdgpu-rewrite-undef-for-phi
-; GCN-O3-NEXT: lcssa))
-; GCN-O3-NEXT: amdgpu-perf-hint
-; GCN-O3-NEXT: cgscc(function(require<uniformity>
-; GCN-O3-NEXT: callbr-prepare
-; GCN-O3-NEXT: safe-stack
-; GCN-O3-NEXT: stack-protector))
-; GCN-O3-NEXT: cgscc(function(machine-function(amdgpu-isel
-; GCN-O3-NEXT: si-fix-sgpr-copies
-; GCN-O3-NEXT: si-i1-copies
-; GCN-O3-NEXT: finalize-isel
-; GCN-O3-NEXT: early-tailduplication
-; GCN-O3-NEXT: opt-phis
-; GCN-O3-NEXT: stack-coloring
-; GCN-O3-NEXT: localstackalloc
-; GCN-O3-NEXT: dead-mi-elimination
-; GCN-O3-NEXT: early-machinelicm
-; GCN-O3-NEXT: machine-cse
-; GCN-O3-NEXT: machine-sink
-; GCN-O3-NEXT: peephole-opt
-; GCN-O3-NEXT: dead-mi-elimination
-; GCN-O3-NEXT: si-fold-operands
-; GCN-O3-NEXT: gcn-dpp-combine
-; GCN-O3-NEXT: si-load-store-opt
-; GCN-O3-NEXT: si-peephole-sdwa
-; GCN-O3-NEXT: early-machinelicm
-; GCN-O3-NEXT: machine-cse
-; GCN-O3-NEXT: si-fold-operands
-; GCN-O3-NEXT: dead-mi-elimination
-; GCN-O3-NEXT: si-shrink-instructions
-; GCN-O3-NEXT: detect-dead-lanes
-; GCN-O3-NEXT: InitUndefPass
-; GCN-O3-NEXT: ProcessImplicitDefsPass
-; GCN-O3-NEXT: unreachable-mbb-elimination
-; GCN-O3-NEXT: require<live-vars>
-; GCN-O3-NEXT: require<machine-loops>
-; GCN-O3-NEXT: phi-node-elimination
-; GCN-O3-NEXT: two-address-instruction
-; GCN-O3-NEXT: register-coalescer
-; GCN-O3-NEXT: rename-independent-subregs
-; GCN-O3-NEXT: machine-scheduler
-; GCN-O3-NEXT: greedy<all>
-; GCN-O3-NEXT: amdgpu-nsa-reassign
-; GCN-O3-NEXT: VirtRegRewriterPass
-; GCN-O3-NEXT: stack-slot-coloring
-; GCN-O3-NEXT: machine-cp
-; GCN-O3-NEXT: machinelicm
-; GCN-O3-NEXT: si-fix-vgpr-copies
-; GCN-O3-NEXT: si-optimize-exec-masking
-; GCN-O3-NEXT: remove-redundant-debug-values
-; GCN-O3-NEXT: fixup-statepoint-caller-saved
-; GCN-O3-NEXT: PostRAMachineSinkingPass
-; GCN-O3-NEXT: ShrinkWrapPass
-; GCN-O3-NEXT: PrologEpilogInserterPass
-; GCN-O3-NEXT: branch-folder
-; GCN-O3-NEXT: tailduplication
-; GCN-O3-NEXT: machine-latecleanup
-; GCN-O3-NEXT: machine-cp
-; GCN-O3-NEXT: post-ra-pseudos
-; GCN-O3-NEXT: postmisched
-; GCN-O3-NEXT: block-placement
-; GCN-O3-NEXT: fentry-insert
-; GCN-O3-NEXT: xray-instrumentation
-; GCN-O3-NEXT: patchable-function
-; GCN-O3-NEXT: gcn-create-vopd
-; GCN-O3-NEXT: si-memory-legalizer
-; GCN-O3-NEXT: si-insert-waitcnts
-; GCN-O3-NEXT: si-late-branch-lowering
-; GCN-O3-NEXT: si-pre-emit-peephole
-; GCN-O3-NEXT: post-RA-hazard-rec
-; GCN-O3-NEXT: AMDGPUWaitSGPRHazardsPass
-; GCN-O3-NEXT: amdgpu-insert-delay-alu
-; GCN-O3-NEXT: branch-relaxation
-; GCN-O3-NEXT: remove-loads-into-fake-uses
-; GCN-O3-NEXT: live-debug-values
-; GCN-O3-NEXT: machine-sanmd
-; GCN-O3-NEXT: stack-frame-layout)
-; GCN-O3-NEXT: invalidate<machine-function-info>))
+; GCN-O3: require<profile-summary>
+; GCN-O3: require<collector-metadata>
+; GCN-O3: pre-isel-intrinsic-lowering
+; GCN-O3: function(expand-large-div-rem
+; GCN-O3: expand-fp)
+; GCN-O3: amdgpu-remove-incompatible-functions
+; GCN-O3: amdgpu-printf-runtime-binding
+; GCN-O3: amdgpu-lower-ctor-dtor
+; GCN-O3: function(amdgpu-image-intrinsic-opt)
+; GCN-O3: expand-variadics
+; GCN-O3: amdgpu-always-inline
+; GCN-O3: always-inline
+; GCN-O3: amdgpu-export-kernel-runtime-handles
+; GCN-O3: amdgpu-sw-lower-lds
+; GCN-O3: amdgpu-lower-module-lds
+; GCN-O3: function(infer-address-spaces
+; GCN-O3: amdgpu-atomic-optimizer
+; GCN-O3: atomic-expand
+; GCN-O3: amdgpu-promote-alloca
+; GCN-O3: separate-const-offset-from-gep<>
+; GCN-O3: slsr
+; GCN-O3: gvn<>
+; GCN-O3: nary-reassociate
+; GCN-O3: early-cse<>
+; GCN-O3: amdgpu-codegenprepare
+; GCN-O3: loop-mssa(loop-reduce)
+; GCN-O3: mergeicmps
+; GCN-O3: expand-memcmp
+; GCN-O3: gc-lowering
+; GCN-O3: lower-constant-intrinsics
+; GCN-O3: UnreachableBlockElimPass
+; GCN-O3: consthoist
+; GCN-O3: ReplaceWithVeclib
+; GCN-O3: partially-inline-libcalls
+; GCN-O3: ee-instrument<post-inline>
+; GCN-O3: scalarize-masked-mem-intrin
+; GCN-O3: ExpandReductionsPass
+; GCN-O3: gvn<>
+; GCN-O3: amdgpu-lower-kernel-arguments)
+; GCN-O3: amdgpu-lower-buffer-fat-pointers
+; GCN-O3: cgscc(function(codegenprepare
+; GCN-O3: load-store-vectorizer
+; GCN-O3: lower-switch
+; GCN-O3: lower-invoke
+; GCN-O3: UnreachableBlockElimPass
+; GCN-O3: flatten-cfg
+; GCN-O3: sink
+; GCN-O3: amdgpu-late-codegenprepare
+; GCN-O3: amdgpu-unify-divergent-exit-nodes
+; GCN-O3: fix-irreducible
+; GCN-O3: unify-loop-exits
+; GCN-O3: StructurizeCFGPass
+; GCN-O3: amdgpu-annotate-uniform
+; GCN-O3: si-annotate-control-flow
+; GCN-O3: amdgpu-rewrite-undef-for-phi
+; GCN-O3: lcssa))
+; GCN-O3: amdgpu-perf-hint
+; GCN-O3: cgscc(function(require<uniformity>
+; GCN-O3: callbr-prepare
+; GCN-O3: safe-stack
+; GCN-O3: stack-protector))
+; GCN-O3: cgscc(function(machine-function(amdgpu-isel
+; GCN-O3: si-fix-sgpr-copies
+; GCN-O3: si-i1-copies
+; GCN-O3: finalize-isel
+; GCN-O3: early-tailduplication
+; GCN-O3: opt-phis
+; GCN-O3: stack-coloring
+; GCN-O3: localstackalloc
+; GCN-O3: dead-mi-elimination
+; GCN-O3: early-machinelicm
+; GCN-O3: machine-cse
+; GCN-O3: machine-sink
+; GCN-O3: peephole-opt
+; GCN-O3: dead-mi-elimination
+; GCN-O3: si-fold-operands
+; GCN-O3: gcn-dpp-combine
+; GCN-O3: si-load-store-opt
+; GCN-O3: si-peephole-sdwa
+; GCN-O3: early-machinelicm
+; GCN-O3: machine-cse
+; GCN-O3: si-fold-operands
+; GCN-O3: dead-mi-elimination
+; GCN-O3: si-shrink-instructions
+; GCN-O3: detect-dead-lanes
+; GCN-O3: InitUndefPass
+; GCN-O3: ProcessImplicitDefsPass
+; GCN-O3: unreachable-mbb-elimination
+; GCN-O3: require<live-vars>
+; GCN-O3: require<machine-loops>
+; GCN-O3: phi-node-elimination
+; GCN-O3: two-address-instruction
+; GCN-O3: register-coalescer
+; GCN-O3: rename-independent-subregs
+; GCN-O3: machine-scheduler
+; GCN-O3: greedy<all>
+; GCN-O3: amdgpu-nsa-reassign
+; GCN-O3: VirtRegRewriterPass
+; GCN-O3: stack-slot-coloring
+; GCN-O3: machine-cp
+; GCN-O3: machinelicm
+; GCN-O3: si-fix-vgpr-copies
+; GCN-O3: si-optimize-exec-masking
+; GCN-O3: remove-redundant-debug-values
+; GCN-O3: fixup-statepoint-caller-saved
+; GCN-O3: PostRAMachineSinkingPass
+; GCN-O3: ShrinkWrapPass
+; GCN-O3: PrologEpilogInserterPass
+; GCN-O3: branch-folder
+; GCN-O3: tailduplication
+; GCN-O3: machine-latecleanup
+; GCN-O3: machine-cp
+; GCN-O3: post-ra-pseudos
+; GCN-O3: postmisched
+; GCN-O3: block-placement
+; GCN-O3: fentry-insert
+; GCN-O3: xray-instrumentation
+; GCN-O3: patchable-function
+; GCN-O3: gcn-create-vopd
+; GCN-O3: si-memory-legalizer
+; GCN-O3: si-insert-waitcnts
+; GCN-O3: si-late-branch-lowering
+; GCN-O3: si-pre-emit-peephole
+; GCN-O3: post-RA-hazard-rec
+; GCN-O3: AMDGPUWaitSGPRHazardsPass
+; GCN-O3: amdgpu-insert-delay-alu
+; GCN-O3: branch-relaxation
+; GCN-O3: remove-loads-into-fake-uses
+; GCN-O3: live-debug-values
+; GCN-O3: machine-sanmd
+; GCN-O3: stack-frame-layout)
+; GCN-O3: invalidate<machine-function-info>))
 
 
 define void @empty() {

>From cd553295ea5357b2886579089660eec822c0cca3 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 24 Apr 2025 04:51:11 +0000
Subject: [PATCH 3/6] separate tests for O2 and O3

---
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 139 ++++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 7ba1771eba08d..fbfb850cd7776 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -1,7 +1,142 @@
-; RUN: llc -O3 -enable-new-pm -mtriple=amdgcn--amdhsa -disable-verify -print-pipeline-passes < %s 2>&1 \
+; RUN: llc -enable-new-pm -disable-verify -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
+; RUN:   | FileCheck -check-prefix=GCN-O2 %s
+
+; RUN: llc -O3 -enable-new-pm -disable-verify -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
-; REQUIRES: asserts
+; GCN-O2: require<MachineModuleAnalysis>
+; GCN-O2: require<profile-summary>
+; GCN-O2: require<collector-metadata>
+; GCN-O2: pre-isel-intrinsic-lowering
+; GCN-O2: function(expand-large-div-rem
+; GCN-O2: expand-fp)
+; GCN-O2: amdgpu-remove-incompatible-functions
+; GCN-O2: amdgpu-printf-runtime-binding
+; GCN-O2: amdgpu-lower-ctor-dtor
+; GCN-O2: function(amdgpu-image-intrinsic-opt)
+; GCN-O2: expand-variadics
+; GCN-O2: amdgpu-always-inline
+; GCN-O2: always-inline
+; GCN-O2: amdgpu-export-kernel-runtime-handles
+; GCN-O2: amdgpu-sw-lower-lds
+; GCN-O2: amdgpu-lower-module-lds
+; GCN-O2: function(infer-address-spaces
+; GCN-O2: amdgpu-atomic-optimizer
+; GCN-O2: atomic-expand
+; GCN-O2: amdgpu-promote-alloca
+; GCN-O2: separate-const-offset-from-gep<>
+; GCN-O2: slsr
+; GCN-O2: early-cse<>
+; GCN-O2: nary-reassociate
+; GCN-O2: early-cse<>
+; GCN-O2: amdgpu-codegenprepare
+; GCN-O2: loop-mssa(loop-reduce)
+; GCN-O2: mergeicmps
+; GCN-O2: expand-memcmp
+; GCN-O2: gc-lowering
+; GCN-O2: lower-constant-intrinsics
+; GCN-O2: UnreachableBlockElimPass
+; GCN-O2: consthoist
+; GCN-O2: ReplaceWithVeclib
+; GCN-O2: partially-inline-libcalls
+; GCN-O2: ee-instrument<post-inline>
+; GCN-O2: scalarize-masked-mem-intrin
+; GCN-O2: ExpandReductionsPass
+; GCN-O2: early-cse<>
+; GCN-O2: amdgpu-lower-kernel-arguments)
+; GCN-O2: amdgpu-lower-buffer-fat-pointers
+; GCN-O2: cgscc(function(codegenprepare
+; GCN-O2: load-store-vectorizer
+; GCN-O2: lower-switch
+; GCN-O2: lower-invoke
+; GCN-O2: UnreachableBlockElimPass
+; GCN-O2: flatten-cfg
+; GCN-O2: sink
+; GCN-O2: amdgpu-late-codegenprepare
+; GCN-O2: amdgpu-unify-divergent-exit-nodes
+; GCN-O2: fix-irreducible
+; GCN-O2: unify-loop-exits
+; GCN-O2: StructurizeCFGPass
+; GCN-O2: amdgpu-annotate-uniform
+; GCN-O2: si-annotate-control-flow
+; GCN-O2: amdgpu-rewrite-undef-for-phi
+; GCN-O2: lcssa))
+; GCN-O2: amdgpu-perf-hint
+; GCN-O2: cgscc(function(require<uniformity>
+; GCN-O2: callbr-prepare
+; GCN-O2: safe-stack
+; GCN-O2: stack-protector))
+; GCN-O2: cgscc(function(machine-function(amdgpu-isel
+; GCN-O2: si-fix-sgpr-copies
+; GCN-O2: si-i1-copies
+; GCN-O2: finalize-isel
+; GCN-O2: early-tailduplication
+; GCN-O2: opt-phis
+; GCN-O2: stack-coloring
+; GCN-O2: localstackalloc
+; GCN-O2: dead-mi-elimination
+; GCN-O2: early-machinelicm
+; GCN-O2: machine-cse
+; GCN-O2: machine-sink
+; GCN-O2: peephole-opt
+; GCN-O2: dead-mi-elimination
+; GCN-O2: si-fold-operands
+; GCN-O2: gcn-dpp-combine
+; GCN-O2: si-load-store-opt
+; GCN-O2: si-peephole-sdwa
+; GCN-O2: early-machinelicm
+; GCN-O2: machine-cse
+; GCN-O2: si-fold-operands
+; GCN-O2: dead-mi-elimination
+; GCN-O2: si-shrink-instructions
+; GCN-O2: detect-dead-lanes
+; GCN-O2: InitUndefPass
+; GCN-O2: ProcessImplicitDefsPass
+; GCN-O2: unreachable-mbb-elimination
+; GCN-O2: require<live-vars>
+; GCN-O2: require<machine-loops>
+; GCN-O2: phi-node-elimination
+; GCN-O2: two-address-instruction
+; GCN-O2: register-coalescer
+; GCN-O2: rename-independent-subregs
+; GCN-O2: machine-scheduler
+; GCN-O2: greedy<all>
+; GCN-O2: amdgpu-nsa-reassign
+; GCN-O2: VirtRegRewriterPass
+; GCN-O2: stack-slot-coloring
+; GCN-O2: machine-cp
+; GCN-O2: machinelicm
+; GCN-O2: si-fix-vgpr-copies
+; GCN-O2: si-optimize-exec-masking
+; GCN-O2: remove-redundant-debug-values
+; GCN-O2: fixup-statepoint-caller-saved
+; GCN-O2: PostRAMachineSinkingPass
+; GCN-O2: ShrinkWrapPass
+; GCN-O2: PrologEpilogInserterPass
+; GCN-O2: branch-folder
+; GCN-O2: tailduplication
+; GCN-O2: machine-latecleanup
+; GCN-O2: machine-cp
+; GCN-O2: post-ra-pseudos
+; GCN-O2: postmisched
+; GCN-O2: block-placement
+; GCN-O2: fentry-insert
+; GCN-O2: xray-instrumentation
+; GCN-O2: patchable-function
+; GCN-O2: gcn-create-vopd
+; GCN-O2: si-memory-legalizer
+; GCN-O2: si-insert-waitcnts
+; GCN-O2: si-late-branch-lowering
+; GCN-O2: si-pre-emit-peephole
+; GCN-O2: post-RA-hazard-rec
+; GCN-O2: AMDGPUWaitSGPRHazardsPass
+; GCN-O2: amdgpu-insert-delay-alu
+; GCN-O2: branch-relaxation
+; GCN-O2: remove-loads-into-fake-uses
+; GCN-O2: live-debug-values
+; GCN-O2: machine-sanmd
+; GCN-O2: stack-frame-layout)
+; GCN-O2: invalidate<machine-function-info>))
 
 ; GCN-O3: require<MachineModuleAnalysis>
 ; GCN-O3: require<profile-summary>

>From ce1ce982b8aaf2565fb73beaa00e0dc44ce1024c Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 5 May 2025 06:49:51 +0000
Subject: [PATCH 4/6] condense test into one line

The pipeline is printed on a single line, so splitting the CHECK directives
across separate lines lets FileCheck skip arbitrary text between matches,
which means the test would not fail if extra passes appeared in the pipeline.
---
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 279 +------------------
 1 file changed, 11 insertions(+), 268 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index fbfb850cd7776..e9b57515e71e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -1,276 +1,19 @@
-; RUN: llc -enable-new-pm -disable-verify -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
+; RUN: llc -enable-new-pm -mtriple=amdgcn--amdhsa -O0 -print-pipeline-passes < %s 2>&1 \
+; RUN:   | FileCheck -check-prefix=GCN-O0 %s
+
+; RUN: llc -enable-new-pm -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
 ; RUN:   | FileCheck -check-prefix=GCN-O2 %s
 
-; RUN: llc -O3 -enable-new-pm -disable-verify -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
+; RUN: llc -O3 -enable-new-pm -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
-; GCN-O2: require<MachineModuleAnalysis>
-; GCN-O2: require<profile-summary>
-; GCN-O2: require<collector-metadata>
-; GCN-O2: pre-isel-intrinsic-lowering
-; GCN-O2: function(expand-large-div-rem
-; GCN-O2: expand-fp)
-; GCN-O2: amdgpu-remove-incompatible-functions
-; GCN-O2: amdgpu-printf-runtime-binding
-; GCN-O2: amdgpu-lower-ctor-dtor
-; GCN-O2: function(amdgpu-image-intrinsic-opt)
-; GCN-O2: expand-variadics
-; GCN-O2: amdgpu-always-inline
-; GCN-O2: always-inline
-; GCN-O2: amdgpu-export-kernel-runtime-handles
-; GCN-O2: amdgpu-sw-lower-lds
-; GCN-O2: amdgpu-lower-module-lds
-; GCN-O2: function(infer-address-spaces
-; GCN-O2: amdgpu-atomic-optimizer
-; GCN-O2: atomic-expand
-; GCN-O2: amdgpu-promote-alloca
-; GCN-O2: separate-const-offset-from-gep<>
-; GCN-O2: slsr
-; GCN-O2: early-cse<>
-; GCN-O2: nary-reassociate
-; GCN-O2: early-cse<>
-; GCN-O2: amdgpu-codegenprepare
-; GCN-O2: loop-mssa(loop-reduce)
-; GCN-O2: mergeicmps
-; GCN-O2: expand-memcmp
-; GCN-O2: gc-lowering
-; GCN-O2: lower-constant-intrinsics
-; GCN-O2: UnreachableBlockElimPass
-; GCN-O2: consthoist
-; GCN-O2: ReplaceWithVeclib
-; GCN-O2: partially-inline-libcalls
-; GCN-O2: ee-instrument<post-inline>
-; GCN-O2: scalarize-masked-mem-intrin
-; GCN-O2: ExpandReductionsPass
-; GCN-O2: early-cse<>
-; GCN-O2: amdgpu-lower-kernel-arguments)
-; GCN-O2: amdgpu-lower-buffer-fat-pointers
-; GCN-O2: cgscc(function(codegenprepare
-; GCN-O2: load-store-vectorizer
-; GCN-O2: lower-switch
-; GCN-O2: lower-invoke
-; GCN-O2: UnreachableBlockElimPass
-; GCN-O2: flatten-cfg
-; GCN-O2: sink
-; GCN-O2: amdgpu-late-codegenprepare
-; GCN-O2: amdgpu-unify-divergent-exit-nodes
-; GCN-O2: fix-irreducible
-; GCN-O2: unify-loop-exits
-; GCN-O2: StructurizeCFGPass
-; GCN-O2: amdgpu-annotate-uniform
-; GCN-O2: si-annotate-control-flow
-; GCN-O2: amdgpu-rewrite-undef-for-phi
-; GCN-O2: lcssa))
-; GCN-O2: amdgpu-perf-hint
-; GCN-O2: cgscc(function(require<uniformity>
-; GCN-O2: callbr-prepare
-; GCN-O2: safe-stack
-; GCN-O2: stack-protector))
-; GCN-O2: cgscc(function(machine-function(amdgpu-isel
-; GCN-O2: si-fix-sgpr-copies
-; GCN-O2: si-i1-copies
-; GCN-O2: finalize-isel
-; GCN-O2: early-tailduplication
-; GCN-O2: opt-phis
-; GCN-O2: stack-coloring
-; GCN-O2: localstackalloc
-; GCN-O2: dead-mi-elimination
-; GCN-O2: early-machinelicm
-; GCN-O2: machine-cse
-; GCN-O2: machine-sink
-; GCN-O2: peephole-opt
-; GCN-O2: dead-mi-elimination
-; GCN-O2: si-fold-operands
-; GCN-O2: gcn-dpp-combine
-; GCN-O2: si-load-store-opt
-; GCN-O2: si-peephole-sdwa
-; GCN-O2: early-machinelicm
-; GCN-O2: machine-cse
-; GCN-O2: si-fold-operands
-; GCN-O2: dead-mi-elimination
-; GCN-O2: si-shrink-instructions
-; GCN-O2: detect-dead-lanes
-; GCN-O2: InitUndefPass
-; GCN-O2: ProcessImplicitDefsPass
-; GCN-O2: unreachable-mbb-elimination
-; GCN-O2: require<live-vars>
-; GCN-O2: require<machine-loops>
-; GCN-O2: phi-node-elimination
-; GCN-O2: two-address-instruction
-; GCN-O2: register-coalescer
-; GCN-O2: rename-independent-subregs
-; GCN-O2: machine-scheduler
-; GCN-O2: greedy<all>
-; GCN-O2: amdgpu-nsa-reassign
-; GCN-O2: VirtRegRewriterPass
-; GCN-O2: stack-slot-coloring
-; GCN-O2: machine-cp
-; GCN-O2: machinelicm
-; GCN-O2: si-fix-vgpr-copies
-; GCN-O2: si-optimize-exec-masking
-; GCN-O2: remove-redundant-debug-values
-; GCN-O2: fixup-statepoint-caller-saved
-; GCN-O2: PostRAMachineSinkingPass
-; GCN-O2: ShrinkWrapPass
-; GCN-O2: PrologEpilogInserterPass
-; GCN-O2: branch-folder
-; GCN-O2: tailduplication
-; GCN-O2: machine-latecleanup
-; GCN-O2: machine-cp
-; GCN-O2: post-ra-pseudos
-; GCN-O2: postmisched
-; GCN-O2: block-placement
-; GCN-O2: fentry-insert
-; GCN-O2: xray-instrumentation
-; GCN-O2: patchable-function
-; GCN-O2: gcn-create-vopd
-; GCN-O2: si-memory-legalizer
-; GCN-O2: si-insert-waitcnts
-; GCN-O2: si-late-branch-lowering
-; GCN-O2: si-pre-emit-peephole
-; GCN-O2: post-RA-hazard-rec
-; GCN-O2: AMDGPUWaitSGPRHazardsPass
-; GCN-O2: amdgpu-insert-delay-alu
-; GCN-O2: branch-relaxation
-; GCN-O2: remove-loads-into-fake-uses
-; GCN-O2: live-debug-values
-; GCN-O2: machine-sanmd
-; GCN-O2: stack-frame-layout)
-; GCN-O2: invalidate<machine-function-info>))
 
-; GCN-O3: require<MachineModuleAnalysis>
-; GCN-O3: require<profile-summary>
-; GCN-O3: require<collector-metadata>
-; GCN-O3: pre-isel-intrinsic-lowering
-; GCN-O3: function(expand-large-div-rem
-; GCN-O3: expand-fp)
-; GCN-O3: amdgpu-remove-incompatible-functions
-; GCN-O3: amdgpu-printf-runtime-binding
-; GCN-O3: amdgpu-lower-ctor-dtor
-; GCN-O3: function(amdgpu-image-intrinsic-opt)
-; GCN-O3: expand-variadics
-; GCN-O3: amdgpu-always-inline
-; GCN-O3: always-inline
-; GCN-O3: amdgpu-export-kernel-runtime-handles
-; GCN-O3: amdgpu-sw-lower-lds
-; GCN-O3: amdgpu-lower-module-lds
-; GCN-O3: function(infer-address-spaces
-; GCN-O3: amdgpu-atomic-optimizer
-; GCN-O3: atomic-expand
-; GCN-O3: amdgpu-promote-alloca
-; GCN-O3: separate-const-offset-from-gep<>
-; GCN-O3: slsr
-; GCN-O3: gvn<>
-; GCN-O3: nary-reassociate
-; GCN-O3: early-cse<>
-; GCN-O3: amdgpu-codegenprepare
-; GCN-O3: loop-mssa(loop-reduce)
-; GCN-O3: mergeicmps
-; GCN-O3: expand-memcmp
-; GCN-O3: gc-lowering
-; GCN-O3: lower-constant-intrinsics
-; GCN-O3: UnreachableBlockElimPass
-; GCN-O3: consthoist
-; GCN-O3: ReplaceWithVeclib
-; GCN-O3: partially-inline-libcalls
-; GCN-O3: ee-instrument<post-inline>
-; GCN-O3: scalarize-masked-mem-intrin
-; GCN-O3: ExpandReductionsPass
-; GCN-O3: gvn<>
-; GCN-O3: amdgpu-lower-kernel-arguments)
-; GCN-O3: amdgpu-lower-buffer-fat-pointers
-; GCN-O3: cgscc(function(codegenprepare
-; GCN-O3: load-store-vectorizer
-; GCN-O3: lower-switch
-; GCN-O3: lower-invoke
-; GCN-O3: UnreachableBlockElimPass
-; GCN-O3: flatten-cfg
-; GCN-O3: sink
-; GCN-O3: amdgpu-late-codegenprepare
-; GCN-O3: amdgpu-unify-divergent-exit-nodes
-; GCN-O3: fix-irreducible
-; GCN-O3: unify-loop-exits
-; GCN-O3: StructurizeCFGPass
-; GCN-O3: amdgpu-annotate-uniform
-; GCN-O3: si-annotate-control-flow
-; GCN-O3: amdgpu-rewrite-undef-for-phi
-; GCN-O3: lcssa))
-; GCN-O3: amdgpu-perf-hint
-; GCN-O3: cgscc(function(require<uniformity>
-; GCN-O3: callbr-prepare
-; GCN-O3: safe-stack
-; GCN-O3: stack-protector))
-; GCN-O3: cgscc(function(machine-function(amdgpu-isel
-; GCN-O3: si-fix-sgpr-copies
-; GCN-O3: si-i1-copies
-; GCN-O3: finalize-isel
-; GCN-O3: early-tailduplication
-; GCN-O3: opt-phis
-; GCN-O3: stack-coloring
-; GCN-O3: localstackalloc
-; GCN-O3: dead-mi-elimination
-; GCN-O3: early-machinelicm
-; GCN-O3: machine-cse
-; GCN-O3: machine-sink
-; GCN-O3: peephole-opt
-; GCN-O3: dead-mi-elimination
-; GCN-O3: si-fold-operands
-; GCN-O3: gcn-dpp-combine
-; GCN-O3: si-load-store-opt
-; GCN-O3: si-peephole-sdwa
-; GCN-O3: early-machinelicm
-; GCN-O3: machine-cse
-; GCN-O3: si-fold-operands
-; GCN-O3: dead-mi-elimination
-; GCN-O3: si-shrink-instructions
-; GCN-O3: detect-dead-lanes
-; GCN-O3: InitUndefPass
-; GCN-O3: ProcessImplicitDefsPass
-; GCN-O3: unreachable-mbb-elimination
-; GCN-O3: require<live-vars>
-; GCN-O3: require<machine-loops>
-; GCN-O3: phi-node-elimination
-; GCN-O3: two-address-instruction
-; GCN-O3: register-coalescer
-; GCN-O3: rename-independent-subregs
-; GCN-O3: machine-scheduler
-; GCN-O3: greedy<all>
-; GCN-O3: amdgpu-nsa-reassign
-; GCN-O3: VirtRegRewriterPass
-; GCN-O3: stack-slot-coloring
-; GCN-O3: machine-cp
-; GCN-O3: machinelicm
-; GCN-O3: si-fix-vgpr-copies
-; GCN-O3: si-optimize-exec-masking
-; GCN-O3: remove-redundant-debug-values
-; GCN-O3: fixup-statepoint-caller-saved
-; GCN-O3: PostRAMachineSinkingPass
-; GCN-O3: ShrinkWrapPass
-; GCN-O3: PrologEpilogInserterPass
-; GCN-O3: branch-folder
-; GCN-O3: tailduplication
-; GCN-O3: machine-latecleanup
-; GCN-O3: machine-cp
-; GCN-O3: post-ra-pseudos
-; GCN-O3: postmisched
-; GCN-O3: block-placement
-; GCN-O3: fentry-insert
-; GCN-O3: xray-instrumentation
-; GCN-O3: patchable-function
-; GCN-O3: gcn-create-vopd
-; GCN-O3: si-memory-legalizer
-; GCN-O3: si-insert-waitcnts
-; GCN-O3: si-late-branch-lowering
-; GCN-O3: si-pre-emit-peephole
-; GCN-O3: post-RA-hazard-rec
-; GCN-O3: AMDGPUWaitSGPRHazardsPass
-; GCN-O3: amdgpu-insert-delay-alu
-; GCN-O3: branch-relaxation
-; GCN-O3: remove-loads-into-fake-uses
-; GCN-O3: live-debug-values
-; GCN-O3: machine-sanmd
-; GCN-O3: stack-frame-layout)
-; GCN-O3: invalidate<machine-function-info>))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,UnreachableBlockElimPass,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))
+
+
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(infer-address-spaces,amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,early-cse<>,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,InitUndefPass,ProcessImplicitDefsPass,unreachable-mbb-elimination,require<live-vars>,require<machine-loops>,phi-node-elimination,two-address-instruction,register-coalescer,rename-independent-subregs,machine-scheduler,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,PostRAMachineSinkingPass,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))
+
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(infer-address-spaces,amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,gvn<>,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,InitUndefPass,ProcessImplicitDefsPass,unreachable-mbb-elimination,require<live-vars>,require<machine-loops>,phi-node-elimination,two-address-instruction,register-coalescer,rename-independent-subregs,machine-scheduler,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,PostRAMachineSinkingPass,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))
 
 
 define void @empty() {

>From 5898658b55362968317175445c5660b8af41fe6e Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Tue, 6 May 2025 09:21:32 +0000
Subject: [PATCH 5/6] early return

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 67f4a36511c5b..fe3a41b814a89 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -329,18 +329,18 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
 
   private:
     void flushMFPMToMPM() {
-      if (!MFPM.isEmpty()) {
-        if (PB.AddInCGSCCOrder) {
-          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-              createCGSCCToFunctionPassAdaptor(
-                  createFunctionToMachineFunctionPassAdaptor(
-                      std::move(MFPM)))));
-        } else {
-          MPM.addPass(createModuleToFunctionPassAdaptor(
-              createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
-        }
-        MFPM = MachineFunctionPassManager();
+      if (MFPM.isEmpty())
+        return;
+
+      if (PB.AddInCGSCCOrder) {
+        MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+            createCGSCCToFunctionPassAdaptor(
+                createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)))));
+      } else {
+        MPM.addPass(createModuleToFunctionPassAdaptor(
+            createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
       }
+      MFPM = MachineFunctionPassManager();
     }
 
     ModulePassManager &MPM;

>From a5389e158c8d131241294f5f6d76e0d6c0004dc9 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 12 May 2025 08:08:45 +0000
Subject: [PATCH 6/6] more early returns

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index fe3a41b814a89..a3b19af4adc39 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -253,15 +253,15 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
 
   private:
     void flushFPMToMPM() {
-      if (!FPM.isEmpty()) {
-        if (PB.AddInCGSCCOrder) {
-          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-              createCGSCCToFunctionPassAdaptor(std::move(FPM))));
-        } else {
-          MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-        }
-        FPM = FunctionPassManager();
+      if (FPM.isEmpty())
+        return;
+      if (PB.AddInCGSCCOrder) {
+        MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+            createCGSCCToFunctionPassAdaptor(std::move(FPM))));
+      } else {
+        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       }
+      FPM = FunctionPassManager();
     }
     ModulePassManager &MPM;
     FunctionPassManager FPM;
@@ -274,17 +274,17 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
     AddMachinePass(ModulePassManager &MPM, const DerivedT &PB)
         : MPM(MPM), PB(PB) {}
     ~AddMachinePass() {
-      if (!MFPM.isEmpty()) {
-        FunctionPassManager FPM;
-        FPM.addPass(
-            createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
-        FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
-        if (this->PB.AddInCGSCCOrder) {
-          MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-              createCGSCCToFunctionPassAdaptor(std::move(FPM))));
-        } else
-          MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-      }
+      if (MFPM.isEmpty())
+        return;
+
+      FunctionPassManager FPM;
+      FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+      FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
+      if (this->PB.AddInCGSCCOrder) {
+        MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+            createCGSCCToFunctionPassAdaptor(std::move(FPM))));
+      } else
+        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
     }
 
     template <typename PassT>



More information about the llvm-commits mailing list