[llvm-branch-commits] [llvm] AMDGPU: Declare pass control flags in header (PR #102865)

Mon Aug 12 04:39:46 PDT 2024

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/102865

>From c5882b7e0b8bc85390cb82495821965013494d12 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 12 Aug 2024 12:46:00 +0400
Subject: [PATCH] AMDGPU: Declare pass control flags in header

This will allow them to be shared between the old PM and new PM files.

I don't really like needing to expose these globally like this; maybe
it would be better to just move TargetPassConfig and the CodeGenPassBuilder
into one common file?
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 203 ++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h  |  41 ++++
 2 files changed, 133 insertions(+), 111 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index cad4585c5b3013..3409a49fe203f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -74,6 +74,7 @@
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
+using namespace llvm::AMDGPU;
 
 namespace {
 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
@@ -186,109 +187,95 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
 } // anonymous namespace
 
-static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
+namespace llvm::AMDGPU {
+cl::opt<bool> EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                                      cl::desc("Run early if-conversion"),
+                                      cl::init(false));
 
-static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-            cl::desc("Run pre-RA exec mask optimizations"),
-            cl::init(true));
+cl::opt<bool> OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                               cl::desc("Run pre-RA exec mask optimizations"),
+                               cl::init(true));
 
-static cl::opt<bool>
+cl::opt<bool>
     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
                   cl::init(true), cl::Hidden);
 
 // Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-  "amdgpu-load-store-vectorizer",
-  cl::desc("Enable load store vectorizer"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);
 
 // Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-  "amdgpu-scalarize-global-loads",
-  cl::desc("Enable global load scalarization"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool> ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                              cl::desc("Enable global load scalarization"),
+                              cl::init(true), cl::Hidden);
 
 // Option to run internalize pass.
-static cl::opt<bool> InternalizeSymbols(
-  "amdgpu-internalize-symbols",
-  cl::desc("Enable elimination of non-kernel functions and unused globals"),
-  cl::init(false),
-  cl::Hidden);
+cl::opt<bool> InternalizeSymbols(
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);
 
 // Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-  "amdgpu-early-inline-all",
-  cl::desc("Inline all functions early"),
-  cl::init(false),
-  cl::Hidden);
+cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                             cl::desc("Inline all functions early"),
+                             cl::init(false), cl::Hidden);
 
-static cl::opt<bool> RemoveIncompatibleFunctions(
+cl::opt<bool> RemoveIncompatibleFunctions(
     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
     cl::desc("Enable removal of functions when they"
              "use features not supported by the target GPU"),
     cl::init(true));
 
-static cl::opt<bool> EnableSDWAPeephole(
-  "amdgpu-sdwa-peephole",
-  cl::desc("Enable SDWA peepholer"),
-  cl::init(true));
+cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                 cl::desc("Enable SDWA peepholer"),
+                                 cl::init(true));
 
-static cl::opt<bool> EnableDPPCombine(
-  "amdgpu-dpp-combine",
-  cl::desc("Enable DPP combiner"),
-  cl::init(true));
+cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                               cl::desc("Enable DPP combiner"), cl::init(true));
 
 // Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-  cl::desc("Enable AMDGPU Alias Analysis"),
-  cl::init(true));
+cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));
 
 // Option to run late CFG structurizer
-static cl::opt<bool, true> LateCFGStructurize(
-  "amdgpu-late-structurize",
-  cl::desc("Enable late CFG structurization"),
-  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
-  cl::Hidden);
+cl::opt<bool, true> LateCFGStructurize(
+    "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"),
+    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden);
 
 // Disable structurizer-based control-flow lowering in order to test convergence
 // control tokens. This should eventually be replaced by the wave-transform.
-static cl::opt<bool, true> DisableStructurizer(
+cl::opt<bool, true> DisableStructurizer(
     "amdgpu-disable-structurizer",
     cl::desc("Disable structurizer for experiments; produces unusable code"),
     cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
 
 // Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-  "amdgpu-simplify-libcall",
-  cl::desc("Enable amdgpu library simplifications"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableLowerKernelArguments(
-  "amdgpu-ir-lower-kernel-arguments",
-  cl::desc("Lower kernel argument loads in IR pass"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableRegReassign(
-  "amdgpu-reassign-regs",
-  cl::desc("Enable register reassign optimizations on gfx10+"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> OptVGPRLiveRange(
+cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);
+
+cl::opt<bool> EnableLowerKernelArguments(
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);
+
+cl::opt<bool> EnableRegReassign(
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",
     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
     cl::init(true), cl::Hidden);
 
-static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
+cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
     "amdgpu-atomic-optimizer-strategy",
     cl::desc("Select DPP or Iterative strategy for scan"),
     cl::init(ScanOptions::Iterative),
@@ -299,91 +286,85 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
 
 // Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-  "amdgpu-mode-register",
-  cl::desc("Enable mode register pass"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool> EnableSIModeRegisterPass("amdgpu-mode-register",
+                                       cl::desc("Enable mode register pass"),
+                                       cl::init(true), cl::Hidden);
 
 // Enable GFX11.5+ s_singleuse_vdst insertion
-static cl::opt<bool>
+cl::opt<bool>
     EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                               cl::desc("Enable s_singleuse_vdst insertion"),
                               cl::init(false), cl::Hidden);
 
 // Enable GFX11+ s_delay_alu insertion
-static cl::opt<bool>
-    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
-                         cl::desc("Enable s_delay_alu insertion"),
-                         cl::init(true), cl::Hidden);
+cl::opt<bool> EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+                                   cl::desc("Enable s_delay_alu insertion"),
+                                   cl::init(true), cl::Hidden);
 
 // Enable GFX11+ VOPD
-static cl::opt<bool>
-    EnableVOPD("amdgpu-enable-vopd",
-               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
-               cl::init(true), cl::Hidden);
+cl::opt<bool> EnableVOPD("amdgpu-enable-vopd",
+                         cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+                         cl::init(true), cl::Hidden);
 
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
-static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
-
-static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
-                                           cl::desc("Adjust wave priority"),
-                                           cl::init(false), cl::Hidden);
-
-static cl::opt<bool> EnableScalarIRPasses(
-  "amdgpu-scalar-ir-passes",
-  cl::desc("Enable scalar IR passes"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool, true> EnableStructurizerWorkarounds(
+cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                            cl::desc("Enable machine DCE inside regalloc"));
+
+cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                    cl::desc("Adjust wave priority"),
+                                    cl::init(false), cl::Hidden);
+
+cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                   cl::desc("Enable scalar IR passes"),
+                                   cl::init(true), cl::Hidden);
+
+cl::opt<bool, true> EnableStructurizerWorkarounds(
     "amdgpu-enable-structurizer-workarounds",
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds),
     cl::init(true), cl::Hidden);
 
-static cl::opt<bool, true> EnableLowerModuleLDS(
+cl::opt<bool, true> EnableLowerModuleLDS(
     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
+cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);
 
-static cl::opt<bool> EnablePromoteKernelArguments(
+cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
     cl::Hidden, cl::init(true));
 
-static cl::opt<bool> EnableImageIntrinsicOptimizer(
+cl::opt<bool> EnableImageIntrinsicOptimizer(
     "amdgpu-enable-image-intrinsic-optimizer",
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool>
+cl::opt<bool>
     EnableLoopPrefetch("amdgpu-loop-prefetch",
                        cl::desc("Enable loop data prefetch on AMDGPU"),
                        cl::Hidden, cl::init(false));
 
-static cl::opt<bool> EnableMaxIlpSchedStrategy(
+cl::opt<bool> EnableMaxIlpSchedStrategy(
     "amdgpu-enable-max-ilp-scheduling-strategy",
     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
     cl::Hidden, cl::init(false));
 
-static cl::opt<bool> EnableRewritePartialRegUses(
+cl::opt<bool> EnableRewritePartialRegUses(
     "amdgpu-enable-rewrite-partial-reg-uses",
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnableHipStdPar(
-  "amdgpu-enable-hipstdpar",
-  cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-  cl::Hidden);
+cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);
+
+} // namespace llvm::AMDGPU
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 4d39ad2b415052..f01e26a846f433 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -16,12 +16,53 @@
 
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include <optional>
 #include <utility>
 
 namespace llvm {
 
+enum class ScanOptions;
+
+namespace AMDGPU {
+
+extern cl::opt<bool> EnableEarlyIfConversion;
+extern cl::opt<bool> OptExecMaskPreRA;
+extern cl::opt<bool> LowerCtorDtor;
+extern cl::opt<bool> EnableLoadStoreVectorizer;
+extern cl::opt<bool> ScalarizeGlobal;
+extern cl::opt<bool> InternalizeSymbols;
+extern cl::opt<bool> EarlyInlineAll;
+extern cl::opt<bool> RemoveIncompatibleFunctions;
+extern cl::opt<bool> EnableSDWAPeephole;
+extern cl::opt<bool> EnableDPPCombine;
+extern cl::opt<bool> EnableAMDGPUAliasAnalysis;
+extern cl::opt<bool, true> LateCFGStructurize;
+extern cl::opt<bool, true> DisableStructurizer;
+extern cl::opt<bool> EnableLibCallSimplify;
+extern cl::opt<bool> EnableLowerKernelArguments;
+extern cl::opt<bool> EnableRegReassign;
+extern cl::opt<bool> OptVGPRLiveRange;
+extern cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy;
+extern cl::opt<bool> EnableSIModeRegisterPass;
+extern cl::opt<bool> EnableInsertSingleUseVDST;
+extern cl::opt<bool> EnableInsertDelayAlu;
+extern cl::opt<bool> EnableVOPD;
+extern cl::opt<bool> EnableDCEInRA;
+extern cl::opt<bool> EnableSetWavePriority;
+extern cl::opt<bool> EnableScalarIRPasses;
+extern cl::opt<bool, true> EnableStructurizerWorkarounds;
+extern cl::opt<bool, true> EnableLowerModuleLDS;
+extern cl::opt<bool> EnablePreRAOptimizations;
+extern cl::opt<bool> EnablePromoteKernelArguments;
+extern cl::opt<bool> EnableImageIntrinsicOptimizer;
+extern cl::opt<bool> EnableLoopPrefetch;
+extern cl::opt<bool> EnableMaxIlpSchedStrategy;
+extern cl::opt<bool> EnableRewritePartialRegUses;
+extern cl::opt<bool> EnableHipStdPar;
+} // namespace AMDGPU
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Target Machine (R600+)
 //===----------------------------------------------------------------------===//