[llvm] [AMDGPU] Move AMDGPUTargetMachine into AMDGPUCodeGenPassBuilder(NFC) (PR #103720)

Christudasan Devadasan via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 14 02:01:16 PDT 2024


https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/103720

This will allow us to reuse the existing flags and the static
functions while building the pipeline for the new pass manager.
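
For example (illustrative only, not part of this patch), a new pass manager
hook added to AMDGPUCodeGenPassBuilder later on could read the very same
cl::opt flags that the legacy AMDGPUPassConfig checks, since both now live in
the same file. The hook name and pass spelling below follow the usual
CodeGenPassBuilder/NPM conventions and are assumptions, not code from this
change:

  // Hypothetical sketch: reuse the legacy pipeline's flag from the NPM builder.
  void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
    // Same cl::opt that AMDGPUPassConfig::addCodeGenPrepare consults.
    if (EnableLoadStoreVectorizer)
      addPass(LoadStoreVectorizerPass());
  }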

From da16d0df05206df2bff51748b03c904d609c2df0 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Wed, 14 Aug 2024 12:28:32 +0530
Subject: [PATCH] [AMDGPU] Move AMDGPUTargetMachine into
 AMDGPUCodeGenPassBuilder(NFC)

This will allow us to reuse the existing flags and the static
functions while building the pipeline for the new pass manager.
---
 .../Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp  |    2 +-
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |    2 +-
 .../AMDGPU/AMDGPUCodeGenPassBuilder.cpp       | 1735 +++++++++++++++-
 .../Target/AMDGPU/AMDGPUCodeGenPassBuilder.h  |  143 +-
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |    2 +-
 .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp  |    2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |    2 +-
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |    2 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |    2 +-
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |    2 +-
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h    |    2 +-
 .../Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp  |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |    2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1751 -----------------
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h  |  162 --
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |    2 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    1 -
 llvm/lib/Target/AMDGPU/R600TargetMachine.cpp  |    1 -
 llvm/lib/Target/AMDGPU/R600TargetMachine.h    |    2 +-
 .../Target/AMDGPU/R600TargetTransformInfo.cpp |    2 +-
 .../Target/AMDGPU/SIAnnotateControlFlow.cpp   |    2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    2 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |    1 -
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |    2 +-
 .../Target/AMDGPU/AMDGPUUnitTests.cpp         |    2 +-
 .../Target/AMDGPU/DwarfRegMappings.cpp        |    2 +-
 .../AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp  |    2 +-
 llvm/unittests/Target/AMDGPU/PALMetadata.cpp  |    2 +-
 34 files changed, 1903 insertions(+), 1945 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index f55f656ff922c1..adaecd4cee8383 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -13,7 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index f57fc168c1dfce..59d8c84430f9d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -8,7 +8,7 @@
 
 #include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 25e36dc4b3691f..e124cfb3fba72d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -14,8 +14,8 @@
 
 #include "AMDGPUCallLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPULegalizerInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/Analysis.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
index fb3d3259171aca..0d7233432fc2b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
@@ -5,15 +5,1748 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains both the AMDGPU target machine and the CodeGen pass
+/// builder. The AMDGPU target machine holds all of the hardware-specific
+/// information needed to emit code for SI+ GPUs in the legacy pass manager
+/// pipeline; the CodeGen pass builder is its equivalent for the new pass manager.
+//
+//===----------------------------------------------------------------------===//
 
 #include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUCtorDtorLowering.h"
+#include "AMDGPUExportClustering.h"
+#include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUMacroFusion.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "AMDGPURegBankSelect.h"
+#include "AMDGPUSplitModule.h"
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "GCNVOPDUtils.h"
+#include "R600.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600TargetMachine.h"
 #include "SIFixSGPRCopies.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIMachineScheduler.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
+#include <optional>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace {
+class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
+public:
+  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+      : RegisterRegAllocBase(N, D, C) {}
+};
+
+class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
+public:
+  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+      : RegisterRegAllocBase(N, D, C) {}
+};
+
+static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
+}
+
+static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
+}
+
+/// -{sgpr|vgpr}-regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+
+static SGPRRegisterRegAlloc
+    defaultSGPRRegAlloc("default",
+                        "pick SGPR register allocator based on -O option",
+                        useDefaultRegisterAllocator);
+
+static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<SGPRRegisterRegAlloc>>
+    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for SGPRs"));
+
+static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<VGPRRegisterRegAlloc>>
+    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for VGPRs"));
+
+static void initializeDefaultSGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = SGPRRegAlloc;
+    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
+  }
+}
+
+static void initializeDefaultVGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = VGPRRegAlloc;
+    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
+  }
+}
+
+static FunctionPass *createBasicSGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createGreedySGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createFastSGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+static FunctionPass *createBasicVGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createGreedyVGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createFastVGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
+}
+
+static SGPRRegisterRegAlloc basicRegAllocSGPR("basic",
+                                              "basic register allocator",
+                                              createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc
+    greedyRegAllocSGPR("greedy", "greedy register allocator",
+                       createGreedySGPRRegisterAllocator);
+
+static SGPRRegisterRegAlloc fastRegAllocSGPR("fast", "fast register allocator",
+                                             createFastSGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc basicRegAllocVGPR("basic",
+                                              "basic register allocator",
+                                              createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc
+    greedyRegAllocVGPR("greedy", "greedy register allocator",
+                       createGreedyVGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc fastRegAllocVGPR("fast", "fast register allocator",
+                                             createFastVGPRRegisterAllocator);
+} // anonymous namespace
+
+static cl::opt<bool>
+    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                            cl::desc("Run early if-conversion"),
+                            cl::init(false));
+
+static cl::opt<bool>
+    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                     cl::desc("Run pre-RA exec mask optimizations"),
+                     cl::init(true));
+
+static cl::opt<bool>
+    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
+                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
+                  cl::init(true), cl::Hidden);
+
+// Option to disable vectorizer for tests.
+static cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);
+
+// Option to control global loads scalarization
+static cl::opt<bool>
+    ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                    cl::desc("Enable global load scalarization"),
+                    cl::init(true), cl::Hidden);
+
+// Option to run internalize pass.
+static cl::opt<bool> InternalizeSymbols(
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);
+
+// Option to inline all early.
+static cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                                    cl::desc("Inline all functions early"),
+                                    cl::init(false), cl::Hidden);
+
+static cl::opt<bool> RemoveIncompatibleFunctions(
+    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
+    cl::desc("Enable removal of functions when they"
+             "use features not supported by the target GPU"),
+    cl::init(true));
+
+static cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                        cl::desc("Enable SDWA peepholer"),
+                                        cl::init(true));
+
+static cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                                      cl::desc("Enable DPP combiner"),
+                                      cl::init(true));
+
+// Enable address space based alias analysis
+static cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));
+
+// Option to run late CFG structurizer
+static cl::opt<bool, true> LateCFGStructurize(
+    "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"),
+    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden);
+
+// Disable structurizer-based control-flow lowering in order to test convergence
+// control tokens. This should eventually be replaced by the wave-transform.
+static cl::opt<bool, true> DisableStructurizer(
+    "amdgpu-disable-structurizer",
+    cl::desc("Disable structurizer for experiments; produces unusable code"),
+    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
+
+// Enable lib calls simplifications
+static cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableLowerKernelArguments(
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool> EnableRegReassign(
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> OptVGPRLiveRange(
+    "amdgpu-opt-vgpr-liverange",
+    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
+    "amdgpu-atomic-optimizer-strategy",
+    cl::desc("Select DPP or Iterative strategy for scan"),
+    cl::init(ScanOptions::Iterative),
+    cl::values(
+        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
+        clEnumValN(ScanOptions::Iterative, "Iterative",
+                   "Use Iterative approach for scan"),
+        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
+
+// Enable Mode register optimization
+static cl::opt<bool>
+    EnableSIModeRegisterPass("amdgpu-mode-register",
+                             cl::desc("Enable mode register pass"),
+                             cl::init(true), cl::Hidden);
+
+// Enable GFX11.5+ s_singleuse_vdst insertion
+static cl::opt<bool>
+    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
+                              cl::desc("Enable s_singleuse_vdst insertion"),
+                              cl::init(false), cl::Hidden);
+
+// Enable GFX11+ s_delay_alu insertion
+static cl::opt<bool>
+    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+                         cl::desc("Enable s_delay_alu insertion"),
+                         cl::init(true), cl::Hidden);
+
+// Enable GFX11+ VOPD
+static cl::opt<bool>
+    EnableVOPD("amdgpu-enable-vopd",
+               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+               cl::init(true), cl::Hidden);
+
+// Option is used in lit tests to prevent deadcoding of patterns inspected.
+static cl::opt<bool>
+    EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                  cl::desc("Enable machine DCE inside regalloc"));
+
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                          cl::desc("Enable scalar IR passes"),
+                                          cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableStructurizerWorkarounds(
+    "amdgpu-enable-structurizer-workarounds",
+    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool, true> EnableLowerModuleLDS(
+    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
+    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnablePromoteKernelArguments(
+    "amdgpu-enable-promote-kernel-arguments",
+    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
+    cl::Hidden, cl::init(true));
+
+static cl::opt<bool> EnableImageIntrinsicOptimizer(
+    "amdgpu-enable-image-intrinsic-optimizer",
+    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool>
+    EnableLoopPrefetch("amdgpu-loop-prefetch",
+                       cl::desc("Enable loop data prefetch on AMDGPU"),
+                       cl::Hidden, cl::init(false));
+
+static cl::opt<bool> EnableMaxIlpSchedStrategy(
+    "amdgpu-enable-max-ilp-scheduling-strategy",
+    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<bool> EnableRewritePartialRegUses(
+    "amdgpu-enable-rewrite-partial-reg-uses",
+    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+    EnableAMDGPUAttributor("amdgpu-attributor-enable",
+                           cl::desc("Enable AMDGPUAttributorPass"),
+                           cl::init(true), cl::Hidden);
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
+  // Register the target
+  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
+  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
+
+  PassRegistry *PR = PassRegistry::getPassRegistry();
+  initializeR600ClauseMergePassPass(*PR);
+  initializeR600ControlFlowFinalizerPass(*PR);
+  initializeR600PacketizerPass(*PR);
+  initializeR600ExpandSpecialInstrsPassPass(*PR);
+  initializeR600VectorRegMergerPass(*PR);
+  initializeGlobalISel(*PR);
+  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
+  initializeGCNDPPCombinePass(*PR);
+  initializeSILowerI1CopiesLegacyPass(*PR);
+  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
+  initializeSILowerWWMCopiesPass(*PR);
+  initializeAMDGPUMarkLastScratchLoadPass(*PR);
+  initializeSILowerSGPRSpillsPass(*PR);
+  initializeSIFixSGPRCopiesLegacyPass(*PR);
+  initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
+  initializeSIShrinkInstructionsPass(*PR);
+  initializeSIOptimizeExecMaskingPreRAPass(*PR);
+  initializeSIOptimizeVGPRLiveRangePass(*PR);
+  initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
+  initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPUAttributorLegacyPass(*PR);
+  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
+  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
+  initializeAMDGPUArgumentUsageInfoPass(*PR);
+  initializeAMDGPUAtomicOptimizerPass(*PR);
+  initializeAMDGPULowerKernelArgumentsPass(*PR);
+  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPULowerKernelAttributesPass(*PR);
+  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+  initializeAMDGPUPostLegalizerCombinerPass(*PR);
+  initializeAMDGPUPreLegalizerCombinerPass(*PR);
+  initializeAMDGPURegBankCombinerPass(*PR);
+  initializeAMDGPURegBankSelectPass(*PR);
+  initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
+  initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
+  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
+  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
+  initializeAMDGPULowerBufferFatPointersPass(*PR);
+  initializeAMDGPURewriteOutArgumentsPass(*PR);
+  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
+  initializeAMDGPUUnifyMetadataPass(*PR);
+  initializeSIAnnotateControlFlowLegacyPass(*PR);
+  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
+  initializeAMDGPUInsertDelayAluPass(*PR);
+  initializeSIInsertHardClausesPass(*PR);
+  initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
+  initializeSIWholeQuadModePass(*PR);
+  initializeSILowerControlFlowPass(*PR);
+  initializeSIPreEmitPeepholePass(*PR);
+  initializeSILateBranchLoweringPass(*PR);
+  initializeSIMemoryLegalizerPass(*PR);
+  initializeSIOptimizeExecMaskingPass(*PR);
+  initializeSIPreAllocateWWMRegsPass(*PR);
+  initializeSIFormMemoryClausesPass(*PR);
+  initializeSIPostRABundlerPass(*PR);
+  initializeGCNCreateVOPDPass(*PR);
+  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+  initializeAMDGPUAAWrapperPassPass(*PR);
+  initializeAMDGPUExternalAAWrapperPass(*PR);
+  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
+  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
+  initializeAMDGPUResourceUsageAnalysisPass(*PR);
+  initializeGCNNSAReassignPass(*PR);
+  initializeGCNPreRAOptimizationsPass(*PR);
+  initializeGCNPreRALongBranchRegPass(*PR);
+  initializeGCNRewritePartialRegUsesPass(*PR);
+  initializeGCNRegPressurePrinterPass(*PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  return std::make_unique<AMDGPUTargetObjectFile>();
+}
+
+static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
+  return new SIScheduleDAGMI(C);
+}
+
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  return DAG;
+}
+
+static ScheduleDAGInstrs *
+createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
+  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  return DAG;
+}
+
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  auto DAG = new GCNIterativeScheduler(
+      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+  return new GCNIterativeScheduler(
+      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
+static ScheduleDAGInstrs *
+createIterativeILPMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+  return DAG;
+}
+
+static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler",
+                                            createSIMachineScheduler);
+
+static MachineSchedRegistry
+    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                                 "Run GCN scheduler to maximize occupancy",
+                                 createGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
+                           createGCNMaxILPMachineScheduler);
+
+static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
+    "gcn-iterative-max-occupancy-experimental",
+    "Run GCN scheduler to maximize occupancy (experimental)",
+    createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry GCNMinRegSchedRegistry(
+    "gcn-iterative-minreg",
+    "Run GCN iterative scheduler for minimal register usage (experimental)",
+    createMinRegScheduler);
+
+static MachineSchedRegistry GCNILPSchedRegistry(
+    "gcn-iterative-ilp",
+    "Run GCN iterative scheduler for ILP scheduling (experimental)",
+    createIterativeILPMachineScheduler);
+
+static StringRef computeDataLayout(const Triple &TT) {
+  if (TT.getArch() == Triple::r600) {
+    // 32-bit pointers.
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
+  }
+
+  // 32-bit private, local, and region pointers. 64-bit global, constant and
+  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
+  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
+  // (address space 7), and 128-bit non-integral buffer resources (address
+  // space 8) which cannot be non-trivially accessed by LLVM memory operations
+  // like getelementptr.
+  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
+         "v32:32-v48:64-v96:"
+         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
+         "G1-ni:7:8:9";
+}
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+  if (!GPU.empty())
+    return GPU;
+
+  // Need to default to a target with flat support for HSA.
+  if (TT.getArch() == Triple::amdgcn)
+    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
+
+  return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+  // The AMDGPU toolchain only supports generating shared objects, so we
+  // must always use PIC.
+  return Reloc::PIC_;
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+) - Legacy Pass Manager interface.
+//===----------------------------------------------------------------------===//
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         std::optional<Reloc::Model> RM,
+                                         std::optional<CodeModel::Model> CM,
+                                         CodeGenOptLevel OptLevel)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+                        FS, Options, getEffectiveRelocModel(RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
+      TLOF(createTLOF(getTargetTriple())) {
+  initAsmInfo();
+  if (TT.getArch() == Triple::amdgcn) {
+    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
+      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
+    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
+      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
+  }
+}
+
+bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
+bool AMDGPUTargetMachine::DisableStructurizer = false;
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+  Attribute GPUAttr = F.getFnAttribute("target-cpu");
+  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  return FSAttr.isValid() ? FSAttr.getValueAsString()
+                          : getTargetFeatureString();
+}
+
+/// Predicate for Internalize pass.
+static bool mustPreserveGV(const GlobalValue &GV) {
+  if (const Function *F = dyn_cast<Function>(&GV))
+    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
+           F->getName().starts_with("__sanitizer_") ||
+           AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+  GV.removeDeadConstantUsers();
+  return !GV.use_empty();
+}
+
+void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
+  AAM.registerFunctionAnalysis<AMDGPUAA>();
+}
+
+static Expected<ScanOptions>
+parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
+  if (Params.empty())
+    return ScanOptions::Iterative;
+  Params.consume_front("strategy=");
+  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
+                    .Case("dpp", ScanOptions::DPP)
+                    .Cases("iterative", "", ScanOptions::Iterative)
+                    .Case("none", ScanOptions::None)
+                    .Default(std::nullopt);
+  if (Result)
+    return *Result;
+  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
+}
+
+Expected<AMDGPUAttributorOptions>
+parseAMDGPUAttributorPassOptions(StringRef Params) {
+  AMDGPUAttributorOptions Result;
+  while (!Params.empty()) {
+    StringRef ParamName;
+    std::tie(ParamName, Params) = Params.split(';');
+    if (ParamName == "closed-world") {
+      Result.IsClosedWorld = true;
+    } else {
+      return make_error<StringError>(
+          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
+              .str(),
+          inconvertibleErrorCode());
+    }
+  }
+  return Result;
+}
+
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+
+#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
+#include "llvm/Passes/TargetPassRegistry.inc"
+
+  PB.registerPipelineStartEPCallback(
+      [](ModulePassManager &PM, OptimizationLevel Level) {
+        FunctionPassManager FPM;
+        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        if (EnableHipStdPar)
+          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+      });
+
+  PB.registerPipelineEarlySimplificationEPCallback(
+      [](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+
+        if (Level == OptimizationLevel::O0)
+          return;
+
+        PM.addPass(AMDGPUUnifyMetadataPass());
+
+        if (InternalizeSymbols) {
+          PM.addPass(InternalizePass(mustPreserveGV));
+          PM.addPass(GlobalDCEPass());
+        }
+
+        if (EarlyInlineAll && !EnableFunctionCalls)
+          PM.addPass(AMDGPUAlwaysInlinePass());
+      });
+
+  PB.registerPeepholeEPCallback(
+      [](FunctionPassManager &FPM, OptimizationLevel Level) {
+        if (Level == OptimizationLevel::O0)
+          return;
+
+        FPM.addPass(AMDGPUUseNativeCallsPass());
+        if (EnableLibCallSimplify)
+          FPM.addPass(AMDGPUSimplifyLibCallsPass());
+      });
+
+  PB.registerCGSCCOptimizerLateEPCallback(
+      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
+        if (Level == OptimizationLevel::O0)
+          return;
+
+        FunctionPassManager FPM;
+
+        // Add promote kernel arguments pass to the opt pipeline right before
+        // infer address spaces which is needed to do actual address space
+        // rewriting.
+        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
+            EnablePromoteKernelArguments)
+          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
+
+        // Add infer address spaces pass to the opt pipeline after inlining
+        // but before SROA to increase SROA opportunities.
+        FPM.addPass(InferAddressSpacesPass());
+
+        // This should run after inlining to have any chance of doing
+        // anything, and before other cleanup optimizations.
+        FPM.addPass(AMDGPULowerKernelAttributesPass());
+
+        if (Level != OptimizationLevel::O0) {
+          // Promote alloca to vector before SROA and loop unroll. If we
+          // manage to eliminate allocas before unroll we may choose to unroll
+          // less.
+          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+        }
+
+        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+      });
+
+  // FIXME: Why is AMDGPUAttributor not in CGSCC?
+  PB.registerOptimizerLastEPCallback(
+      [this](ModulePassManager &MPM, OptimizationLevel Level) {
+        if (Level != OptimizationLevel::O0) {
+          MPM.addPass(AMDGPUAttributorPass(*this));
+        }
+      });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // We want to support the -lto-partitions=N option as "best effort".
+        // For that, we need to lower LDS earlier in the pipeline before the
+        // module is partitioned for codegen.
+        if (EnableLowerModuleLDS)
+          PM.addPass(AMDGPULowerModuleLDSPass(*this));
+        if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0)
+          PM.addPass(AMDGPUAttributorPass(*this));
+      });
+
+  PB.registerRegClassFilterParsingCallback(
+      [](StringRef FilterName) -> RegAllocFilterFunc {
+        if (FilterName == "sgpr")
+          return onlyAllocateSGPRs;
+        if (FilterName == "vgpr")
+          return onlyAllocateVGPRs;
+        return nullptr;
+      });
+}
+
+int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
+  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+          AddrSpace == AMDGPUAS::REGION_ADDRESS)
+             ? -1
+             : 0;
+}
+
+bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+                                              unsigned DestAS) const {
+  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
+         AMDGPU::isFlatGlobalAddrSpace(DestAS);
+}
+
+unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+  const auto *LD = dyn_cast<LoadInst>(V);
+  if (!LD)
+    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+  // It must be a generic pointer loaded.
+  assert(V->getType()->isPointerTy() &&
+         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
+
+  const auto *Ptr = LD->getPointerOperand();
+  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+  // For a generic pointer loaded from the constant memory, it could be assumed
+  // as a global pointer since the constant memory is only populated on the
+  // host side. As implied by the offload programming model, only global
+  // pointers could be referenced on the host side.
+  return AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+std::pair<const Value *, unsigned>
+AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
+  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_is_shared:
+      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
+    case Intrinsic::amdgcn_is_private:
+      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
+    default:
+      break;
+    }
+    return std::pair(nullptr, -1);
+  }
+  // Check the global pointer predication based on
+  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and
+  // the order of 'is_shared' and 'is_private' is not significant.
+  Value *Ptr;
+  if (match(
+          const_cast<Value *>(V),
+          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
+                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
+                      m_Deferred(Ptr))))))
+    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
+
+  return std::pair(nullptr, -1);
+}
+
+unsigned
+AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
+  switch (Kind) {
+  case PseudoSourceValue::Stack:
+  case PseudoSourceValue::FixedStack:
+    return AMDGPUAS::PRIVATE_ADDRESS;
+  case PseudoSourceValue::ConstantPool:
+  case PseudoSourceValue::GOT:
+  case PseudoSourceValue::JumpTable:
+  case PseudoSourceValue::GlobalValueCallEntry:
+  case PseudoSourceValue::ExternalSymbolCallEntry:
+    return AMDGPUAS::CONSTANT_ADDRESS;
+  }
+  return AMDGPUAS::FLAT_ADDRESS;
+}
+
+bool AMDGPUTargetMachine::splitModule(
+    Module &M, unsigned NumParts,
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
+  // but all current users of this API don't have one ready and would need to
+  // create one anyway. Let's hide the boilerplate for now to keep it simple.
+
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+
+  PassBuilder PB(this);
+  PB.registerModuleAnalyses(MAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  ModulePassManager MPM;
+  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
+  MPM.run(M, MAM);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+) - Legacy Pass Manager interface.
+//===----------------------------------------------------------------------===//
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   std::optional<Reloc::Model> RM,
+                                   std::optional<CodeModel::Model> CM,
+                                   CodeGenOptLevel OL, bool JIT)
+    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const TargetSubtargetInfo *
+GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+  StringRef GPU = getGPUName(F);
+  StringRef FS = getFeatureString(F);
+
+  SmallString<128> SubtargetKey(GPU);
+  SubtargetKey.append(FS);
+
+  auto &I = SubtargetMap[SubtargetKey];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
+  }
+
+  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
+  return I.get();
+}
+
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
+  return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
+Error GCNTargetMachine::buildCodeGenPipeline(
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
+    PassInstrumentationCallbacks *PIC) {
+  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
+  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Legacy Pass Setup
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
+  return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
+namespace {
+
+class GCNPassConfig final : public AMDGPUPassConfig {
+public:
+  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
+      : AMDGPUPassConfig(TM, PM) {
+    // It is necessary to know the register usage of the entire call graph.  We
+    // allow calls without EnableAMDGPUFunctionCalls if they are marked
+    // noinline, so this is always required.
+    setRequiresCodeGenSCCOrder(true);
+    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+  }
+
+  GCNTargetMachine &getGCNTargetMachine() const {
+    return getTM<GCNTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override;
+
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
+        C, std::make_unique<PostGenericScheduler>(C),
+        /*RemoveKillFlags=*/true);
+    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    if (ST.shouldClusterStores())
+      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+    DAG->addMutation(
+        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
+      DAG->addMutation(createVOPDPairingMutation());
+    return DAG;
+  }
+
+  bool addPreISel() override;
+  void addMachineSSAOptimization() override;
+  bool addILPOpts() override;
+  bool addInstSelector() override;
+  bool addIRTranslator() override;
+  void addPreLegalizeMachineIR() override;
+  bool addLegalizeMachineIR() override;
+  void addPreRegBankSelect() override;
+  bool addRegBankSelect() override;
+  void addPreGlobalInstructionSelect() override;
+  bool addGlobalInstructionSelect() override;
+  void addFastRegAlloc() override;
+  void addOptimizedRegAlloc() override;
+
+  FunctionPass *createSGPRAllocPass(bool Optimized);
+  FunctionPass *createVGPRAllocPass(bool Optimized);
+  FunctionPass *createRegAllocPass(bool Optimized) override;
+
+  bool addRegAssignAndRewriteFast() override;
+  bool addRegAssignAndRewriteOptimized() override;
+
+  void addPreRegAlloc() override;
+  bool addPreRewrite() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+
+} // end anonymous namespace
+
+AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+  // Exceptions and StackMaps are not supported, so these passes will never do
+  // anything.
+  disablePass(&StackMapLivenessID);
+  disablePass(&FuncletLayoutID);
+  // Garbage collection is not supported.
+  disablePass(&GCLoweringID);
+  disablePass(&ShadowStackGCLoweringID);
+}
+
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOptLevel::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+    addPass(createLoopDataPrefetchPass());
+  addPass(createSeparateConstOffsetFromGEPPass());
+  // ReassociateGEPs exposes more opportunities for SLSR. See
+  // the example in reassociate-geps-and-slsr.ll.
+  addPass(createStraightLineStrengthReducePass());
+  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+  // EarlyCSE can reuse.
+  addEarlyCSEOrGVNPass();
+  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+  addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates redundant common expressions, so run
+  // EarlyCSE after it.
+  addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addIRPasses() {
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
+  Triple::ArchType Arch = TM.getTargetTriple().getArch();
+  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
+    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
+
+  // There is no reason to run these.
+  disablePass(&StackMapLivenessID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&PatchableFunctionID);
+
+  addPass(createAMDGPUPrintfRuntimeBinding());
+  if (LowerCtorDtor)
+    addPass(createAMDGPUCtorDtorLoweringLegacyPass());
+
+  if (isPassEnabled(EnableImageIntrinsicOptimizer))
+    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+
+  // This can be disabled by passing ::Disable here or on the command line
+  // with --expand-variadics-override=disable.
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
+
+  // Function calls are not supported, so make sure we inline everything.
+  addPass(createAMDGPUAlwaysInlinePass());
+  addPass(createAlwaysInlinerLegacyPass());
+
+  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
+  if (Arch == Triple::r600)
+    addPass(createR600OpenCLImageTypeLoweringPass());
+
+  // Replace OpenCL enqueued block function pointers with global variables.
+  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
+  // Runs before PromoteAlloca so the latter can account for function uses
+  if (EnableLowerModuleLDS) {
+    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
+  }
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
+    addPass(createInferAddressSpacesPass());
+
+  // Run atomic optimizer before Atomic Expand
+  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
+      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
+      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
+    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
+  }
+
+  addPass(createAtomicExpandLegacyPass());
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    addPass(createAMDGPUPromoteAlloca());
+
+    if (isPassEnabled(EnableScalarIRPasses))
+      addStraightLineScalarOptimizationPasses();
+
+    if (EnableAMDGPUAliasAnalysis) {
+      addPass(createAMDGPUAAWrapperPass());
+      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
+                                             AAResults &AAR) {
+        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+          AAR.addAAResult(WrapperPass->getResult());
+      }));
+    }
+
+    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+      // TODO: May want to move later or split into an early and late one.
+      addPass(createAMDGPUCodeGenPreparePass());
+    }
+
+    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
+    // have expanded.
+    if (TM.getOptLevel() > CodeGenOptLevel::Less)
+      addPass(createLICMPass());
+  }
+
+  TargetPassConfig::addIRPasses();
+
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  if (isPassEnabled(EnableScalarIRPasses))
+    addEarlyCSEOrGVNPass();
+}
+
+void AMDGPUPassConfig::addCodeGenPrepare() {
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
+    // analysis, and should be removed.
+    addPass(createAMDGPUAnnotateKernelFeaturesPass());
+  }
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+      EnableLowerKernelArguments)
+    addPass(createAMDGPULowerKernelArgumentsPass());
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    // This lowering has been placed after codegenprepare to take advantage of
+    // address mode matching (which is why it isn't put with the LDS lowerings).
+    // It could be placed anywhere before uniformity annotations (an analysis
+    // that it changes by splitting up fat pointers into their components)
+    // but has been put before switch lowering and CFG flattening so that those
+    // passes can run on the more optimized control flow this pass creates in
+    // many cases.
+    //
+    // FIXME: This should ideally be put after the LoadStoreVectorizer.
+    // However, due to some annoying facts about ResourceUsageAnalysis,
+    // (especially as exercised in the resource-usage-dead-function test),
+    // we need all the function passes from codegenprepare all the way through
+    // said resource usage analysis to run on the call graph produced
+    // before codegenprepare runs (because codegenprepare will knock some
+    // nodes out of the graph, which leads to function-level passes not
+    // being run on them, which causes crashes in the resource usage analysis).
+    addPass(createAMDGPULowerBufferFatPointersPass());
+    // In accordance with the above FIXME, manually force all the
+    // function-level passes into a CGSCCPassManager.
+    addPass(new DummyCGSCCPass());
+  }
+
+  TargetPassConfig::addCodeGenPrepare();
+
+  if (isPassEnabled(EnableLoadStoreVectorizer))
+    addPass(createLoadStoreVectorizerPass());
+
+  // LowerSwitch pass may introduce unreachable blocks that can
+  // cause unexpected behavior for subsequent passes. Placing it
+  // here seems better, as these blocks would then get cleaned up by
+  // UnreachableBlockElim, inserted next in the pass flow.
+  addPass(createLowerSwitchPass());
+}
+
+bool AMDGPUPassConfig::addPreISel() {
+  if (TM->getOptLevel() > CodeGenOptLevel::None)
+    addPass(createFlattenCFGPass());
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool AMDGPUPassConfig::addGCPasses() {
+  // Do nothing. GC is not supported.
+  return false;
+}
+
+llvm::ScheduleDAGInstrs *
+AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
+MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
+    BumpPtrAllocator &Allocator, const Function &F,
+    const TargetSubtargetInfo *STI) const {
+  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
+      Allocator, F, static_cast<const R600Subtarget *>(STI));
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Legacy Pass Setup
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGInstrs *
+GCNPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  if (ST.enableSIScheduler())
+    return createSIMachineScheduler(C);
+
+  if (EnableMaxIlpSchedStrategy)
+    return createGCNMaxILPMachineScheduler(C);
+
+  return createGCNMaxOccupancyMachineScheduler(C);
+}
+
+bool GCNPassConfig::addPreISel() {
+  AMDGPUPassConfig::addPreISel();
+
+  if (TM->getOptLevel() > CodeGenOptLevel::None)
+    addPass(createSinkingPass());
+
+  if (TM->getOptLevel() > CodeGenOptLevel::None)
+    addPass(createAMDGPULateCodeGenPrepareLegacyPass());
+
+  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+  // regions formed by them.
+  addPass(&AMDGPUUnifyDivergentExitNodesID);
+  if (!LateCFGStructurize && !DisableStructurizer) {
+    if (EnableStructurizerWorkarounds) {
+      addPass(createFixIrreduciblePass());
+      addPass(createUnifyLoopExitsPass());
+    }
+    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
+  }
+  addPass(createAMDGPUAnnotateUniformValuesLegacy());
+  if (!LateCFGStructurize && !DisableStructurizer) {
+    addPass(createSIAnnotateControlFlowLegacyPass());
+    // TODO: Move this right after structurizeCFG to avoid extra divergence
+    // analysis. This depends on stopping SIAnnotateControlFlow from making
+    // control flow modifications.
+    addPass(createAMDGPURewriteUndefForPHILegacyPass());
+  }
+  addPass(createLCSSAPass());
+
+  if (TM->getOptLevel() > CodeGenOptLevel::Less)
+    addPass(&AMDGPUPerfHintAnalysisLegacyID);
+
+  return false;
+}
+
+void GCNPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  // We want to fold operands after PeepholeOptimizer has run (or as part of
+  // it), because it will eliminate extra copies making it easier to fold the
+  // real source operand. We want to eliminate dead instructions after, so that
+  // we see fewer uses of the copies. We then need to clean up the dead
+  // instructions leftover after the operands are folded as well.
+  //
+  // XXX - Can we get away without running DeadMachineInstructionElim again?
+  addPass(&SIFoldOperandsID);
+  if (EnableDPPCombine)
+    addPass(&GCNDPPCombineID);
+  addPass(&SILoadStoreOptimizerID);
+  if (isPassEnabled(EnableSDWAPeephole)) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&EarlyMachineLICMID);
+    addPass(&MachineCSEID);
+    addPass(&SIFoldOperandsID);
+  }
+  addPass(&DeadMachineInstructionElimID);
+  addPass(createSIShrinkInstructionsPass());
+}
+
+bool GCNPassConfig::addILPOpts() {
+  if (EnableEarlyIfConversion)
+    addPass(&EarlyIfConverterID);
+
+  TargetPassConfig::addILPOpts();
+  return false;
+}
+
+bool GCNPassConfig::addInstSelector() {
+  AMDGPUPassConfig::addInstSelector();
+  addPass(&SIFixSGPRCopiesLegacyID);
+  addPass(createSILowerI1CopiesLegacyPass());
+  return false;
+}
+
+bool GCNPassConfig::addIRTranslator() {
+  addPass(new IRTranslator(getOptLevel()));
+  return false;
+}
+
+void GCNPassConfig::addPreLegalizeMachineIR() {
+  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
+  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
+  addPass(new Localizer());
+}
+
+bool GCNPassConfig::addLegalizeMachineIR() {
+  addPass(new Legalizer());
+  return false;
+}
+
+void GCNPassConfig::addPreRegBankSelect() {
+  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
+  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
+}
+
+bool GCNPassConfig::addRegBankSelect() {
+  addPass(new AMDGPURegBankSelect());
+  return false;
+}
+
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
+  addPass(createAMDGPURegBankCombiner(IsOptNone));
+}
+
+bool GCNPassConfig::addGlobalInstructionSelect() {
+  addPass(new InstructionSelect(getOptLevel()));
+  return false;
+}
+
+void GCNPassConfig::addPreRegAlloc() {
+  if (LateCFGStructurize) {
+    addPass(createAMDGPUMachineCFGStructurizerPass());
+  }
+}
+
+void GCNPassConfig::addFastRegAlloc() {
+  // FIXME: We have to disable the verifier here because of PHIElimination +
+  // TwoAddressInstructions disabling it.
+
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID);
+
+  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
+
+  TargetPassConfig::addFastRegAlloc();
+}
+
+void GCNPassConfig::addOptimizedRegAlloc() {
+  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
+  // instructions that cause scheduling barriers.
+  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+
+  if (OptExecMaskPreRA)
+    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+
+  if (EnableRewritePartialRegUses)
+    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
+
+  if (isPassEnabled(EnablePreRAOptimizations))
+    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
+
+  // This is not an essential optimization and it has a noticeable impact on
+  // compilation time, so we only enable it from O2.
+  if (TM->getOptLevel() > CodeGenOptLevel::Less)
+    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+
+  // FIXME: When an instruction has a killed operand and the instruction is
+  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
+  // of the register in LiveVariables. This triggers a verifier failure, so we
+  // should fix it and enable the verifier.
+  if (OptVGPRLiveRange)
+    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID);
+
+  if (EnableDCEInRA)
+    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
+
+  TargetPassConfig::addOptimizedRegAlloc();
+}
+
+bool GCNPassConfig::addPreRewrite() {
+  addPass(&SILowerWWMCopiesID);
+  if (EnableRegReassign)
+    addPass(&GCNNSAReassignID);
+  return true;
+}
+
+FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
+                  initializeDefaultSGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
+                  initializeDefaultVGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyVGPRRegisterAllocator();
+
+  return createFastVGPRRegisterAllocator();
+}
+
+FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
+  llvm_unreachable("should not be used");
+}
+
+static const char RegAllocOptNotSupportedMessage[] =
+    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and "
+    "-vgpr-regalloc";
+
+bool GCNPassConfig::addRegAssignAndRewriteFast() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(&GCNPreRALongBranchRegID);
+
+  addPass(createSGPRAllocPass(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+  addPass(&SIPreAllocateWWMRegsID);
+
+  addPass(createVGPRAllocPass(false));
+
+  addPass(&SILowerWWMCopiesID);
+  return true;
+}
+
+bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(&GCNPreRALongBranchRegID);
+
+  addPass(createSGPRAllocPass(true));
+
+  // Commit allocated register changes. This is mostly necessary because too
+  // many things rely on the use lists of the physical registers, such as the
+  // verifier. This is only necessary with allocators which use LiveIntervals,
+  // since FastRegAlloc does the replacements itself.
+  addPass(createVirtRegRewriter(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+  addPass(&SIPreAllocateWWMRegsID);
+
+  addPass(createVGPRAllocPass(true));
+
+  addPreRewrite();
+  addPass(&VirtRegRewriterID);
+
+  addPass(&AMDGPUMarkLastScratchLoadID);
+
+  return true;
+}
+
+void GCNPassConfig::addPostRegAlloc() {
+  addPass(&SIFixVGPRCopiesID);
+  if (getOptLevel() > CodeGenOptLevel::None)
+    addPass(&SIOptimizeExecMaskingID);
+  TargetPassConfig::addPostRegAlloc();
+}
+
+void GCNPassConfig::addPreSched2() {
+  if (TM->getOptLevel() > CodeGenOptLevel::None)
+    addPass(createSIShrinkInstructionsPass());
+  addPass(&SIPostRABundlerID);
+}
+
+void GCNPassConfig::addPreEmitPass() {
+  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
+    addPass(&GCNCreateVOPDID);
+  addPass(createSIMemoryLegalizerPass());
+  addPass(createSIInsertWaitcntsPass());
+
+  addPass(createSIModeRegisterPass());
+
+  if (getOptLevel() > CodeGenOptLevel::None)
+    addPass(&SIInsertHardClausesID);
+
+  addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
+  if (getOptLevel() > CodeGenOptLevel::None)
+    addPass(&SIPreEmitPeepholeID);
+  // The hazard recognizer that runs as part of the post-ra scheduler does not
+  // guarantee to be able to handle all hazards correctly. This is because if there
+  // are multiple scheduling regions in a basic block, the regions are scheduled
+  // bottom up, so when we begin to schedule a region we don't know what
+  // instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  addPass(&PostRAHazardRecognizerID);
+
+  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
+    addPass(&AMDGPUInsertSingleUseVDSTID);
+
+  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
+    addPass(&AMDGPUInsertDelayAluID);
+
+  addPass(&BranchRelaxationPassID);
+}
+
+TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new GCNPassConfig(*this, PM);
+}
+
+void GCNTargetMachine::registerMachineRegisterInfoCallback(
+    MachineFunction &MF) const {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MF.getRegInfo().addDelegate(MFI);
+}
+
+MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
+    BumpPtrAllocator &Allocator, const Function &F,
+    const TargetSubtargetInfo *STI) const {
+  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
+      Allocator, F, static_cast<const GCNSubtarget *>(STI));
+}
+
+yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
+  return new yaml::SIMachineFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  return new yaml::SIMachineFunctionInfo(
+      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
+}
+
+bool GCNTargetMachine::parseMachineFunctionInfo(
+    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
+    SMDiagnostic &Error, SMRange &SourceRange) const {
+  const yaml::SIMachineFunctionInfo &YamlMFI =
+      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
+  MachineFunction &MF = PFS.MF;
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
+    return true;
+
+  if (MFI->Occupancy == 0) {
+    // Fixup the subtarget dependent default value.
+    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
+  }
+
+  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
+    Register TempReg;
+    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
+      SourceRange = RegName.SourceRange;
+      return true;
+    }
+    RegVal = TempReg;
+
+    return false;
+  };
+
+  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
+                                   Register &RegVal) {
+    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
+  };
+
+  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
+    return true;
+
+  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
+    return true;
+
+  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
+                            MFI->LongBranchReservedReg))
+    return true;
+
+  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
+    // Create a diagnostic for the register string literal.
+    const MemoryBuffer &Buffer =
+        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
+                         RegName.Value.size(), SourceMgr::DK_Error,
+                         "incorrect register class for field", RegName.Value,
+                         std::nullopt, std::nullopt);
+    SourceRange = RegName.SourceRange;
+    return true;
+  };
+
+  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
+      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
+      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
+    return true;
+
+  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
+      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
+    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
+  }
+
+  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
+      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
+    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
+  }
+
+  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
+      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
+    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
+  }
+
+  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
+    Register ParsedReg;
+    if (parseRegister(YamlReg, ParsedReg))
+      return true;
+
+    MFI->reserveWWMRegister(ParsedReg);
+  }
+
+  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
+                                   const TargetRegisterClass &RC,
+                                   ArgDescriptor &Arg, unsigned UserSGPRs,
+                                   unsigned SystemSGPRs) {
+    // Skip parsing if it's not present.
+    if (!A)
+      return false;
+
+    if (A->IsRegister) {
+      Register Reg;
+      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
+        SourceRange = A->RegisterName.SourceRange;
+        return true;
+      }
+      if (!RC.contains(Reg))
+        return diagnoseRegisterClass(A->RegisterName);
+      Arg = ArgDescriptor::createRegister(Reg);
+    } else
+      Arg = ArgDescriptor::createStack(A->StackOffset);
+    // Check and apply the optional mask.
+    if (A->Mask)
+      Arg = ArgDescriptor::createArg(Arg, *A->Mask);
+
+    MFI->NumUserSGPRs += UserSGPRs;
+    MFI->NumSystemSGPRs += SystemSGPRs;
+    return false;
+  };
+
+  if (YamlMFI.ArgInfo &&
+      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
+                             AMDGPU::SGPR_128RegClass,
+                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
+                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
+                             2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
+                             MFI->ArgInfo.QueuePtr, 2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
+                             AMDGPU::SReg_64RegClass,
+                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
+                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
+                             2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
+                             AMDGPU::SReg_64RegClass,
+                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
+                             AMDGPU::SGPR_32RegClass,
+                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
+                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.LDSKernelId,
+                             0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
+                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
+                             0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
+                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
+                             0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
+                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
+                             0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
+                             AMDGPU::SGPR_32RegClass,
+                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
+                             AMDGPU::SGPR_32RegClass,
+                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
+                             AMDGPU::SReg_64RegClass,
+                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
+                             AMDGPU::SReg_64RegClass,
+                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDX,
+                             0, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDY,
+                             0, 0) ||
+       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDZ,
+                             0, 0)))
+    return true;
+
+  if (ST.hasIEEEMode())
+    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
+  if (ST.hasDX10ClampMode())
+    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
+
+  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
+  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
+                                      ? DenormalMode::IEEE
+                                      : DenormalMode::PreserveSign;
+  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
+                                       ? DenormalMode::IEEE
+                                       : DenormalMode::PreserveSign;
+
+  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
+                                          ? DenormalMode::IEEE
+                                          : DenormalMode::PreserveSign;
+  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
+                                           ? DenormalMode::IEEE
+                                           : DenormalMode::PreserveSign;
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU CodeGen Pass Builder interface.
+//===----------------------------------------------------------------------===//
 
 AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
     GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
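A note on the register-allocator plumbing carried over verbatim above: createSGPRAllocPass and createVGPRAllocPass resolve the -sgpr-regalloc/-vgpr-regalloc overrides through a once-initialized registry default, and fall back to a class-filtered greedy or fast allocator when no override was given. The standalone sketch below only models that resolution in plain C++; the names (AllocFactory, pickAllocator, useDefault) are illustrative stand-ins, not LLVM APIs and not part of this patch.

#include <iostream>
#include <mutex>
#include <string>

// Illustrative sketch only; not part of the patch and not an LLVM API.
// Models how createSGPRAllocPass() above decides which allocator to build.
using AllocFactory = std::string (*)();

// Sentinel factory meaning "no -sgpr-regalloc override on the command line".
static std::string useDefault() { return ""; }

static AllocFactory CommandLineFactory = &useDefault; // what option parsing stored
static AllocFactory RegistryDefault = nullptr;
static std::once_flag InitDefaultFlag;

static void initDefaultOnce() {
  // Install the command-line value as the registry default exactly once,
  // mirroring initializeDefaultSGPRRegisterAllocatorOnce() above.
  if (!RegistryDefault)
    RegistryDefault = CommandLineFactory;
}

// Explicit override wins; otherwise pick greedy (optimized builds) or fast.
static std::string pickAllocator(bool Optimized) {
  std::call_once(InitDefaultFlag, initDefaultOnce);
  if (RegistryDefault != &useDefault)
    return RegistryDefault();
  return Optimized ? "greedy allocator, SGPR-only filter"
                   : "fast allocator, SGPR-only filter";
}

int main() {
  std::cout << pickAllocator(/*Optimized=*/true) << "\n";
}

The real code additionally threads the onlyAllocateSGPRs/onlyAllocateVGPRs register-class filters into the allocator factories, which is what allows SGPRs and VGPRs to be allocated in two separate phases with SILowerSGPRSpills and SIPreAllocateWWMRegs run in between.
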
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
index e656e166b3eb2e..752442eda8e1b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
@@ -9,12 +9,153 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCODEGENPASSBUILDER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCODEGENPASSBUILDER_H
 
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Passes/CodeGenPassBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include <optional>
+#include <utility>
 
 namespace llvm {
 
-class GCNTargetMachine;
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+) - For Legacy Pass Manager.
+//===----------------------------------------------------------------------===//
+
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+  StringRef getGPUName(const Function &F) const;
+  StringRef getFeatureString(const Function &F) const;
+
+public:
+  static bool EnableLateStructurizeCFG;
+  static bool EnableFunctionCalls;
+  static bool EnableLowerModuleLDS;
+  static bool DisableStructurizer;
+
+  AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      std::optional<Reloc::Model> RM,
+                      std::optional<CodeModel::Model> CM, CodeGenOptLevel OL);
+  ~AMDGPUTargetMachine() override;
+
+  const TargetSubtargetInfo *getSubtargetImpl() const;
+  const TargetSubtargetInfo *
+  getSubtargetImpl(const Function &) const override = 0;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  void registerPassBuilderCallbacks(PassBuilder &PB) override;
+  void registerDefaultAliasAnalyses(AAManager &) override;
+
+  /// Get the integer value of a null pointer in the given address space.
+  static int64_t getNullPointerValue(unsigned AddrSpace);
+
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+  unsigned getAssumedAddrSpace(const Value *V) const override;
+
+  std::pair<const Value *, unsigned>
+  getPredicatedAddrSpace(const Value *V) const override;
+
+  unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
+
+  bool splitModule(Module &M, unsigned NumParts,
+                   function_ref<void(std::unique_ptr<Module> MPart)>
+                       ModuleCallback) override;
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+) - For Legacy Pass Manager.
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine final : public AMDGPUTargetMachine {
+private:
+  mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
+
+public:
+  GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options,
+                   std::optional<Reloc::Model> RM,
+                   std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
+                   bool JIT);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
+
+  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
+
+  bool useIPRA() const override { return true; }
+
+  Error buildCodeGenPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out,
+                             raw_pwrite_stream *DwoOut,
+                             CodeGenFileType FileType,
+                             const CGPassBuilderOption &Opts,
+                             PassInstrumentationCallbacks *PIC) override;
+
+  void registerMachineRegisterInfoCallback(MachineFunction &MF) const override;
+
+  MachineFunctionInfo *
+  createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
+                            const TargetSubtargetInfo *STI) const override;
+
+  yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+  yaml::MachineFunctionInfo *
+  convertFuncInfoToYAML(const MachineFunction &MF) const override;
+  bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+                                PerFunctionMIParsingState &PFS,
+                                SMDiagnostic &Error,
+                                SMRange &SourceRange) const override;
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup - For Legacy Pass Manager.
+//===----------------------------------------------------------------------===//
+
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM);
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override;
+
+  void addEarlyCSEOrGVNPass();
+  void addStraightLineScalarOptimizationPasses();
+  void addIRPasses() override;
+  void addCodeGenPrepare() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addGCPasses() override;
+
+  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
+
+  /// Check if a pass is enabled given the \p Opt option. When the option is
+  /// used explicitly, it always overrides the default. Otherwise the option's
+  /// default value is used, provided the pass is meant to run at an
+  /// optimization level of at least \p Level.
+  bool isPassEnabled(const cl::opt<bool> &Opt,
+                     CodeGenOptLevel Level = CodeGenOptLevel::Default) const {
+    if (Opt.getNumOccurrences())
+      return Opt;
+    if (TM->getOptLevel() < Level)
+      return false;
+    return Opt;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPU CodeGen Pass Builder interface.
+//===----------------------------------------------------------------------===//
 
 class AMDGPUCodeGenPassBuilder
     : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
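
The isPassEnabled helper declared above encodes a small but easy-to-miss rule: an option that was explicitly passed on the command line always wins, and only when it was not passed does the optimization-level minimum gate the option's default. A minimal stand-alone sketch of that rule, using simplified stand-ins for cl::opt<bool> and CodeGenOptLevel (the BoolOpt struct and Level enum below are not LLVM types), looks like this:

#include <cassert>

// Illustrative sketch only; simplified stand-ins for cl::opt<bool> and
// CodeGenOptLevel as used by AMDGPUPassConfig::isPassEnabled above.
enum class Level { None = 0, Less = 1, Default = 2, Aggressive = 3 };

struct BoolOpt {
  bool Value;          // current value (default or user-provided)
  bool GivenOnCmdLine; // analogous to Opt.getNumOccurrences() != 0
};

static bool isPassEnabled(const BoolOpt &Opt, Level PipelineLevel,
                          Level MinLevel = Level::Default) {
  if (Opt.GivenOnCmdLine)
    return Opt.Value;           // explicit flag always overrides
  if (PipelineLevel < MinLevel)
    return false;               // below the pass's minimum level
  return Opt.Value;             // fall back to the option's default
}

int main() {
  BoolOpt EnableVOPD{true, false};
  assert(isPassEnabled(EnableVOPD, Level::Less, Level::Less));       // default on
  assert(!isPassEnabled(EnableVOPD, Level::None, Level::Less));      // -O0: off
  BoolOpt ForcedOff{false, true};
  assert(!isPassEnabled(ForcedOff, Level::Aggressive, Level::Less)); // flag wins
}

So addPreEmitPass()'s isPassEnabled(EnableVOPD, CodeGenOptLevel::Less) enables VOPD formation by default from -O1 upward, while an explicit -amdgpu-enable-vopd=0/1 forces the decision at any optimization level.
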
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..eea56933921c69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -13,7 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "SIModeRegisterDefaults.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4e913d1b32e1f1..d182a9498d87db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -16,7 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUIGroupLP.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6a0134e07567a1..1da6aea36b143a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,9 +13,9 @@
 
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/R600MCTargetDesc.h"
 #include "R600RegisterInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index 88429e3f0e2181..5057d4aa219f69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -62,8 +62,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 05ed1b322c0d1b..61f74a29d34e96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -13,10 +13,10 @@
 
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPURegisterBankInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 36dfebacaed686..0b24150e304d22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -13,7 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c6c4b8f9306471..3740c754705765 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -14,9 +14,9 @@
 #include "AMDGPULegalizerInfo.h"
 
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 77971323aa1ec6..2b7bd0601a637d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -194,7 +194,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "GCNSubtarget.h"
 #include "SIDefines.h"
 #include "llvm/ADT/SetOperations.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index bb2603e0076e4b..d04eb842b111f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -177,7 +177,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/BitVector.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index f5b5e9e4275986..82c0dac9465b0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -15,8 +15,8 @@
 #include "AMDGPUMCInstLower.h"
 #include "AMDGPU.h"
 #include "AMDGPUAsmPrinter.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUMachineFunction.h"
-#include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 5c656f158e7146..89c82388291958 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -15,7 +15,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
 
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/Casting.h"
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 040e931b82af2f..891965bdd92d34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -14,7 +14,7 @@
 
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index bd0f0e048809bc..dd9292528f7d1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -31,7 +31,7 @@
 /// module splitting.
 
 #include "AMDGPUSplitModule.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 2e1bdf46924783..0f6ac1444d9cf2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,10 +13,10 @@
 
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUCallLowering.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPULegalizerInfo.h"
 #include "AMDGPURegisterBankInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "R600Subtarget.h"
 #include "SIMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
deleted file mode 100644
index d82be9a7e9041a..00000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ /dev/null
@@ -1,1751 +0,0 @@
-//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// The AMDGPU target machine contains all of the hardware specific
-/// information  needed to emit code for SI+ GPUs.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUTargetMachine.h"
-#include "AMDGPU.h"
-#include "AMDGPUAliasAnalysis.h"
-#include "AMDGPUCodeGenPassBuilder.h"
-#include "AMDGPUCtorDtorLowering.h"
-#include "AMDGPUExportClustering.h"
-#include "AMDGPUIGroupLP.h"
-#include "AMDGPUISelDAGToDAG.h"
-#include "AMDGPUMacroFusion.h"
-#include "AMDGPUPerfHintAnalysis.h"
-#include "AMDGPURegBankSelect.h"
-#include "AMDGPUSplitModule.h"
-#include "AMDGPUTargetObjectFile.h"
-#include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUUnifyDivergentExitNodes.h"
-#include "GCNIterativeScheduler.h"
-#include "GCNSchedStrategy.h"
-#include "GCNVOPDUtils.h"
-#include "R600.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600TargetMachine.h"
-#include "SIFixSGPRCopies.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIMachineScheduler.h"
-#include "TargetInfo/AMDGPUTargetInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
-#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
-#include "llvm/CodeGen/GlobalISel/Legalizer.h"
-#include "llvm/CodeGen/GlobalISel/Localizer.h"
-#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
-#include "llvm/CodeGen/MIRParser/MIParser.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegAllocRegistry.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Passes/PassBuilder.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Transforms/HipStdPar/HipStdPar.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/IPO/ExpandVariadics.h"
-#include "llvm/Transforms/IPO/GlobalDCE.h"
-#include "llvm/Transforms/IPO/Internalize.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
-#include <optional>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-namespace {
-class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
-public:
-  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
-};
-
-class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
-public:
-  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
-};
-
-static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
-                              const MachineRegisterInfo &MRI,
-                              const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
-}
-
-static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
-                              const MachineRegisterInfo &MRI,
-                              const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
-}
-
-/// -{sgpr|vgpr}-regalloc=... command line option.
-static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
-
-/// A dummy default pass factory indicates whether the register allocator is
-/// overridden on the command line.
-static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
-static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
-
-static SGPRRegisterRegAlloc
-defaultSGPRRegAlloc("default",
-                    "pick SGPR register allocator based on -O option",
-                    useDefaultRegisterAllocator);
-
-static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
-               RegisterPassParser<SGPRRegisterRegAlloc>>
-SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for SGPRs"));
-
-static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
-               RegisterPassParser<VGPRRegisterRegAlloc>>
-VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for VGPRs"));
-
-
-static void initializeDefaultSGPRRegisterAllocatorOnce() {
-  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
-
-  if (!Ctor) {
-    Ctor = SGPRRegAlloc;
-    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
-  }
-}
-
-static void initializeDefaultVGPRRegisterAllocatorOnce() {
-  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
-
-  if (!Ctor) {
-    Ctor = VGPRRegAlloc;
-    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
-  }
-}
-
-static FunctionPass *createBasicSGPRRegisterAllocator() {
-  return createBasicRegisterAllocator(onlyAllocateSGPRs);
-}
-
-static FunctionPass *createGreedySGPRRegisterAllocator() {
-  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
-}
-
-static FunctionPass *createFastSGPRRegisterAllocator() {
-  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
-}
-
-static FunctionPass *createBasicVGPRRegisterAllocator() {
-  return createBasicRegisterAllocator(onlyAllocateVGPRs);
-}
-
-static FunctionPass *createGreedyVGPRRegisterAllocator() {
-  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
-}
-
-static FunctionPass *createFastVGPRRegisterAllocator() {
-  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
-}
-
-static SGPRRegisterRegAlloc basicRegAllocSGPR(
-  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
-static SGPRRegisterRegAlloc greedyRegAllocSGPR(
-  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
-
-static SGPRRegisterRegAlloc fastRegAllocSGPR(
-  "fast", "fast register allocator", createFastSGPRRegisterAllocator);
-
-
-static VGPRRegisterRegAlloc basicRegAllocVGPR(
-  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
-static VGPRRegisterRegAlloc greedyRegAllocVGPR(
-  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
-
-static VGPRRegisterRegAlloc fastRegAllocVGPR(
-  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
-} // anonymous namespace
-
-static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
-
-static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-            cl::desc("Run pre-RA exec mask optimizations"),
-            cl::init(true));
-
-static cl::opt<bool>
-    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
-                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
-                  cl::init(true), cl::Hidden);
-
-// Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-  "amdgpu-load-store-vectorizer",
-  cl::desc("Enable load store vectorizer"),
-  cl::init(true),
-  cl::Hidden);
-
-// Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-  "amdgpu-scalarize-global-loads",
-  cl::desc("Enable global load scalarization"),
-  cl::init(true),
-  cl::Hidden);
-
-// Option to run internalize pass.
-static cl::opt<bool> InternalizeSymbols(
-  "amdgpu-internalize-symbols",
-  cl::desc("Enable elimination of non-kernel functions and unused globals"),
-  cl::init(false),
-  cl::Hidden);
-
-// Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-  "amdgpu-early-inline-all",
-  cl::desc("Inline all functions early"),
-  cl::init(false),
-  cl::Hidden);
-
-static cl::opt<bool> RemoveIncompatibleFunctions(
-    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
-    cl::desc("Enable removal of functions when they"
-             "use features not supported by the target GPU"),
-    cl::init(true));
-
-static cl::opt<bool> EnableSDWAPeephole(
-  "amdgpu-sdwa-peephole",
-  cl::desc("Enable SDWA peepholer"),
-  cl::init(true));
-
-static cl::opt<bool> EnableDPPCombine(
-  "amdgpu-dpp-combine",
-  cl::desc("Enable DPP combiner"),
-  cl::init(true));
-
-// Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-  cl::desc("Enable AMDGPU Alias Analysis"),
-  cl::init(true));
-
-// Option to run late CFG structurizer
-static cl::opt<bool, true> LateCFGStructurize(
-  "amdgpu-late-structurize",
-  cl::desc("Enable late CFG structurization"),
-  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
-  cl::Hidden);
-
-// Disable structurizer-based control-flow lowering in order to test convergence
-// control tokens. This should eventually be replaced by the wave-transform.
-static cl::opt<bool, true> DisableStructurizer(
-    "amdgpu-disable-structurizer",
-    cl::desc("Disable structurizer for experiments; produces unusable code"),
-    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
-
-// Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-  "amdgpu-simplify-libcall",
-  cl::desc("Enable amdgpu library simplifications"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableLowerKernelArguments(
-  "amdgpu-ir-lower-kernel-arguments",
-  cl::desc("Lower kernel argument loads in IR pass"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableRegReassign(
-  "amdgpu-reassign-regs",
-  cl::desc("Enable register reassign optimizations on gfx10+"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> OptVGPRLiveRange(
-    "amdgpu-opt-vgpr-liverange",
-    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
-    cl::init(true), cl::Hidden);
-
-static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
-    "amdgpu-atomic-optimizer-strategy",
-    cl::desc("Select DPP or Iterative strategy for scan"),
-    cl::init(ScanOptions::Iterative),
-    cl::values(
-        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
-        clEnumValN(ScanOptions::Iterative, "Iterative",
-                   "Use Iterative approach for scan"),
-        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
-
-// Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-  "amdgpu-mode-register",
-  cl::desc("Enable mode register pass"),
-  cl::init(true),
-  cl::Hidden);
-
-// Enable GFX11.5+ s_singleuse_vdst insertion
-static cl::opt<bool>
-    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
-                              cl::desc("Enable s_singleuse_vdst insertion"),
-                              cl::init(false), cl::Hidden);
-
-// Enable GFX11+ s_delay_alu insertion
-static cl::opt<bool>
-    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
-                         cl::desc("Enable s_delay_alu insertion"),
-                         cl::init(true), cl::Hidden);
-
-// Enable GFX11+ VOPD
-static cl::opt<bool>
-    EnableVOPD("amdgpu-enable-vopd",
-               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
-               cl::init(true), cl::Hidden);
-
-// Option is used in lit tests to prevent deadcoding of patterns inspected.
-static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
-
-static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
-                                           cl::desc("Adjust wave priority"),
-                                           cl::init(false), cl::Hidden);
-
-static cl::opt<bool> EnableScalarIRPasses(
-  "amdgpu-scalar-ir-passes",
-  cl::desc("Enable scalar IR passes"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableStructurizerWorkarounds(
-    "amdgpu-enable-structurizer-workarounds",
-    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
-    cl::Hidden);
-
-static cl::opt<bool, true> EnableLowerModuleLDS(
-    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
-    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
-    cl::Hidden);
-
-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
-
-static cl::opt<bool> EnablePromoteKernelArguments(
-    "amdgpu-enable-promote-kernel-arguments",
-    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
-    cl::Hidden, cl::init(true));
-
-static cl::opt<bool> EnableImageIntrinsicOptimizer(
-    "amdgpu-enable-image-intrinsic-optimizer",
-    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
-    cl::Hidden);
-
-static cl::opt<bool>
-    EnableLoopPrefetch("amdgpu-loop-prefetch",
-                       cl::desc("Enable loop data prefetch on AMDGPU"),
-                       cl::Hidden, cl::init(false));
-
-static cl::opt<bool> EnableMaxIlpSchedStrategy(
-    "amdgpu-enable-max-ilp-scheduling-strategy",
-    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
-    cl::Hidden, cl::init(false));
-
-static cl::opt<bool> EnableRewritePartialRegUses(
-    "amdgpu-enable-rewrite-partial-reg-uses",
-    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
-    cl::Hidden);
-
-static cl::opt<bool> EnableHipStdPar(
-  "amdgpu-enable-hipstdpar",
-  cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-  cl::Hidden);
-
-static cl::opt<bool>
-    EnableAMDGPUAttributor("amdgpu-attributor-enable",
-                           cl::desc("Enable AMDGPUAttributorPass"),
-                           cl::init(true), cl::Hidden);
-
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
-  // Register the target
-  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
-  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
-
-  PassRegistry *PR = PassRegistry::getPassRegistry();
-  initializeR600ClauseMergePassPass(*PR);
-  initializeR600ControlFlowFinalizerPass(*PR);
-  initializeR600PacketizerPass(*PR);
-  initializeR600ExpandSpecialInstrsPassPass(*PR);
-  initializeR600VectorRegMergerPass(*PR);
-  initializeGlobalISel(*PR);
-  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
-  initializeGCNDPPCombinePass(*PR);
-  initializeSILowerI1CopiesLegacyPass(*PR);
-  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
-  initializeSILowerWWMCopiesPass(*PR);
-  initializeAMDGPUMarkLastScratchLoadPass(*PR);
-  initializeSILowerSGPRSpillsPass(*PR);
-  initializeSIFixSGPRCopiesLegacyPass(*PR);
-  initializeSIFixVGPRCopiesPass(*PR);
-  initializeSIFoldOperandsPass(*PR);
-  initializeSIPeepholeSDWAPass(*PR);
-  initializeSIShrinkInstructionsPass(*PR);
-  initializeSIOptimizeExecMaskingPreRAPass(*PR);
-  initializeSIOptimizeVGPRLiveRangePass(*PR);
-  initializeSILoadStoreOptimizerPass(*PR);
-  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
-  initializeAMDGPUAlwaysInlinePass(*PR);
-  initializeAMDGPUAttributorLegacyPass(*PR);
-  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
-  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
-  initializeAMDGPUArgumentUsageInfoPass(*PR);
-  initializeAMDGPUAtomicOptimizerPass(*PR);
-  initializeAMDGPULowerKernelArgumentsPass(*PR);
-  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
-  initializeAMDGPULowerKernelAttributesPass(*PR);
-  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
-  initializeAMDGPUPostLegalizerCombinerPass(*PR);
-  initializeAMDGPUPreLegalizerCombinerPass(*PR);
-  initializeAMDGPURegBankCombinerPass(*PR);
-  initializeAMDGPURegBankSelectPass(*PR);
-  initializeAMDGPUPromoteAllocaPass(*PR);
-  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
-  initializeAMDGPUCodeGenPreparePass(*PR);
-  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
-  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
-  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
-  initializeAMDGPULowerBufferFatPointersPass(*PR);
-  initializeAMDGPURewriteOutArgumentsPass(*PR);
-  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
-  initializeAMDGPUUnifyMetadataPass(*PR);
-  initializeSIAnnotateControlFlowLegacyPass(*PR);
-  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
-  initializeAMDGPUInsertDelayAluPass(*PR);
-  initializeSIInsertHardClausesPass(*PR);
-  initializeSIInsertWaitcntsPass(*PR);
-  initializeSIModeRegisterPass(*PR);
-  initializeSIWholeQuadModePass(*PR);
-  initializeSILowerControlFlowPass(*PR);
-  initializeSIPreEmitPeepholePass(*PR);
-  initializeSILateBranchLoweringPass(*PR);
-  initializeSIMemoryLegalizerPass(*PR);
-  initializeSIOptimizeExecMaskingPass(*PR);
-  initializeSIPreAllocateWWMRegsPass(*PR);
-  initializeSIFormMemoryClausesPass(*PR);
-  initializeSIPostRABundlerPass(*PR);
-  initializeGCNCreateVOPDPass(*PR);
-  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
-  initializeAMDGPUAAWrapperPassPass(*PR);
-  initializeAMDGPUExternalAAWrapperPass(*PR);
-  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
-  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
-  initializeAMDGPUResourceUsageAnalysisPass(*PR);
-  initializeGCNNSAReassignPass(*PR);
-  initializeGCNPreRAOptimizationsPass(*PR);
-  initializeGCNPreRALongBranchRegPass(*PR);
-  initializeGCNRewritePartialRegUsesPass(*PR);
-  initializeGCNRegPressurePrinterPass(*PR);
-}
-
-static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
-  return std::make_unique<AMDGPUTargetObjectFile>();
-}
-
-static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
-  return new SIScheduleDAGMI(C);
-}
-
-static ScheduleDAGInstrs *
-createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
-  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  ScheduleDAGMILive *DAG =
-    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  if (ST.shouldClusterStores())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
-  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
-  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
-  return DAG;
-}
-
-static ScheduleDAGInstrs *
-createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
-  ScheduleDAGMILive *DAG =
-      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
-  return DAG;
-}
-
-static ScheduleDAGInstrs *
-createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
-  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  auto DAG = new GCNIterativeScheduler(C,
-    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  if (ST.shouldClusterStores())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  return DAG;
-}
-
-static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
-  return new GCNIterativeScheduler(C,
-    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
-}
-
-static ScheduleDAGInstrs *
-createIterativeILPMachineScheduler(MachineSchedContext *C) {
-  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  auto DAG = new GCNIterativeScheduler(C,
-    GCNIterativeScheduler::SCHEDULE_ILP);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  if (ST.shouldClusterStores())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
-  return DAG;
-}
-
-static MachineSchedRegistry
-SISchedRegistry("si", "Run SI's custom scheduler",
-                createSIMachineScheduler);
-
-static MachineSchedRegistry
-GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
-                             "Run GCN scheduler to maximize occupancy",
-                             createGCNMaxOccupancyMachineScheduler);
-
-static MachineSchedRegistry
-    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
-                           createGCNMaxILPMachineScheduler);
-
-static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
-    "gcn-iterative-max-occupancy-experimental",
-    "Run GCN scheduler to maximize occupancy (experimental)",
-    createIterativeGCNMaxOccupancyMachineScheduler);
-
-static MachineSchedRegistry GCNMinRegSchedRegistry(
-    "gcn-iterative-minreg",
-    "Run GCN iterative scheduler for minimal register usage (experimental)",
-    createMinRegScheduler);
-
-static MachineSchedRegistry GCNILPSchedRegistry(
-    "gcn-iterative-ilp",
-    "Run GCN iterative scheduler for ILP scheduling (experimental)",
-    createIterativeILPMachineScheduler);
-
-static StringRef computeDataLayout(const Triple &TT) {
-  if (TT.getArch() == Triple::r600) {
-    // 32-bit pointers.
-    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
-  }
-
-  // 32-bit private, local, and region pointers. 64-bit global, constant and
-  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
-  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
-  // (address space 7), and 128-bit non-integral buffer resources (address
-  // space 8) which cannot be non-trivially accessed by LLVM memory operations
-  // like getelementptr.
-  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
-         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
-         "v32:32-v48:64-v96:"
-         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
-         "G1-ni:7:8:9";
-}
-
-LLVM_READNONE
-static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
-  if (!GPU.empty())
-    return GPU;
-
-  // Need to default to a target with flat support for HSA.
-  if (TT.getArch() == Triple::amdgcn)
-    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
-
-  return "r600";
-}
-
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
-  // The AMDGPU toolchain only supports generating shared objects, so we
-  // must always use PIC.
-  return Reloc::PIC_;
-}
-
-AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
-                                         StringRef CPU, StringRef FS,
-                                         const TargetOptions &Options,
-                                         std::optional<Reloc::Model> RM,
-                                         std::optional<CodeModel::Model> CM,
-                                         CodeGenOptLevel OptLevel)
-    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
-                        FS, Options, getEffectiveRelocModel(RM),
-                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
-      TLOF(createTLOF(getTargetTriple())) {
-  initAsmInfo();
-  if (TT.getArch() == Triple::amdgcn) {
-    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
-      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
-    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
-      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
-  }
-}
-
-bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
-bool AMDGPUTargetMachine::EnableFunctionCalls = false;
-bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
-bool AMDGPUTargetMachine::DisableStructurizer = false;
-
-AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
-
-StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
-  Attribute GPUAttr = F.getFnAttribute("target-cpu");
-  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
-}
-
-StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
-  Attribute FSAttr = F.getFnAttribute("target-features");
-
-  return FSAttr.isValid() ? FSAttr.getValueAsString()
-                          : getTargetFeatureString();
-}
-
-/// Predicate for Internalize pass.
-static bool mustPreserveGV(const GlobalValue &GV) {
-  if (const Function *F = dyn_cast<Function>(&GV))
-    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
-           F->getName().starts_with("__sanitizer_") ||
-           AMDGPU::isEntryFunctionCC(F->getCallingConv());
-
-  GV.removeDeadConstantUsers();
-  return !GV.use_empty();
-}
-
-void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
-  AAM.registerFunctionAnalysis<AMDGPUAA>();
-}
-
-static Expected<ScanOptions>
-parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
-  if (Params.empty())
-    return ScanOptions::Iterative;
-  Params.consume_front("strategy=");
-  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
-                    .Case("dpp", ScanOptions::DPP)
-                    .Cases("iterative", "", ScanOptions::Iterative)
-                    .Case("none", ScanOptions::None)
-                    .Default(std::nullopt);
-  if (Result)
-    return *Result;
-  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
-}
-
-Expected<AMDGPUAttributorOptions>
-parseAMDGPUAttributorPassOptions(StringRef Params) {
-  AMDGPUAttributorOptions Result;
-  while (!Params.empty()) {
-    StringRef ParamName;
-    std::tie(ParamName, Params) = Params.split(';');
-    if (ParamName == "closed-world") {
-      Result.IsClosedWorld = true;
-    } else {
-      return make_error<StringError>(
-          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
-              .str(),
-          inconvertibleErrorCode());
-    }
-  }
-  return Result;
-}
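
These two parsers back the textual pass-parameter syntax of the new pass manager. A hedged sketch of how they might be exercised through PassBuilder, assuming the pass names and parameter spellings below match what AMDGPUPassRegistry.def registers:

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

Error buildExamplePipeline(TargetMachine &TM, ModulePassManager &MPM) {
  PassBuilder PB(&TM);
  // "strategy=dpp" is handed to parseAMDGPUAtomicOptimizerStrategy and
  // "closed-world" to parseAMDGPUAttributorPassOptions (names assumed).
  return PB.parsePassPipeline(
      MPM, "function(amdgpu-atomic-optimizer<strategy=dpp>),"
           "amdgpu-attributor<closed-world>");
}
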
-
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
-
-#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
-#include "llvm/Passes/TargetPassRegistry.inc"
-
-  PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        FunctionPassManager FPM;
-        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-        if (EnableHipStdPar)
-          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
-      });
-
-  PB.registerPipelineEarlySimplificationEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        PM.addPass(AMDGPUPrintfRuntimeBindingPass());
-
-        if (Level == OptimizationLevel::O0)
-          return;
-
-        PM.addPass(AMDGPUUnifyMetadataPass());
-
-        if (InternalizeSymbols) {
-          PM.addPass(InternalizePass(mustPreserveGV));
-          PM.addPass(GlobalDCEPass());
-        }
-
-        if (EarlyInlineAll && !EnableFunctionCalls)
-          PM.addPass(AMDGPUAlwaysInlinePass());
-      });
-
-  PB.registerPeepholeEPCallback(
-      [](FunctionPassManager &FPM, OptimizationLevel Level) {
-        if (Level == OptimizationLevel::O0)
-          return;
-
-        FPM.addPass(AMDGPUUseNativeCallsPass());
-        if (EnableLibCallSimplify)
-          FPM.addPass(AMDGPUSimplifyLibCallsPass());
-      });
-
-  PB.registerCGSCCOptimizerLateEPCallback(
-      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
-        if (Level == OptimizationLevel::O0)
-          return;
-
-        FunctionPassManager FPM;
-
-        // Add promote kernel arguments pass to the opt pipeline right before
-        // infer address spaces which is needed to do actual address space
-        // rewriting.
-        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
-            EnablePromoteKernelArguments)
-          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
-
-        // Add infer address spaces pass to the opt pipeline after inlining
-        // but before SROA to increase SROA opportunities.
-        FPM.addPass(InferAddressSpacesPass());
-
-        // This should run after inlining to have any chance of doing
-        // anything, and before other cleanup optimizations.
-        FPM.addPass(AMDGPULowerKernelAttributesPass());
-
-        if (Level != OptimizationLevel::O0) {
-          // Promote alloca to vector before SROA and loop unroll. If we
-          // manage to eliminate allocas before unroll we may choose to unroll
-          // less.
-          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
-        }
-
-        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-      });
-
-  // FIXME: Why is AMDGPUAttributor not in CGSCC?
-  PB.registerOptimizerLastEPCallback(
-      [this](ModulePassManager &MPM, OptimizationLevel Level) {
-        if (Level != OptimizationLevel::O0) {
-          MPM.addPass(AMDGPUAttributorPass(*this));
-        }
-      });
-
-  PB.registerFullLinkTimeOptimizationLastEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel Level) {
-        // We want to support the -lto-partitions=N option as "best effort".
-        // For that, we need to lower LDS earlier in the pipeline before the
-        // module is partitioned for codegen.
-        if (EnableLowerModuleLDS)
-          PM.addPass(AMDGPULowerModuleLDSPass(*this));
-        if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0)
-          PM.addPass(AMDGPUAttributorPass(*this));
-      });
-
-  PB.registerRegClassFilterParsingCallback(
-      [](StringRef FilterName) -> RegAllocFilterFunc {
-        if (FilterName == "sgpr")
-          return onlyAllocateSGPRs;
-        if (FilterName == "vgpr")
-          return onlyAllocateVGPRs;
-        return nullptr;
-      });
-}
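
The extension-point callbacks above only fire once a driver hands this TargetMachine to a PassBuilder. A minimal sketch of that driver-side setup (the module M, the TM reference, and the O2 level are illustrative):

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

void runDefaultO2(Module &M, TargetMachine &TM) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Constructing the PassBuilder with the TM invokes
  // registerPassBuilderCallbacks(), so the default pipeline below already
  // contains the AMDGPU passes added at the EP callbacks.
  PassBuilder PB(&TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM =
      PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
  MPM.run(M, MAM);
}
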
-
-int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
-  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
-          AddrSpace == AMDGPUAS::REGION_ADDRESS)
-             ? -1
-             : 0;
-}
-
-bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
-                                              unsigned DestAS) const {
-  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
-         AMDGPU::isFlatGlobalAddrSpace(DestAS);
-}
-
-unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
-  const auto *LD = dyn_cast<LoadInst>(V);
-  if (!LD)
-    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
-
-  // The loaded value must be a generic (flat) pointer.
-  assert(V->getType()->isPointerTy() &&
-         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
-
-  const auto *Ptr = LD->getPointerOperand();
-  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
-    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
-  // A generic pointer loaded from constant memory can be assumed to be a
-  // global pointer, since constant memory is only populated on the host side.
-  // As implied by the offload programming model, only global pointers can be
-  // referenced on the host side.
-  return AMDGPUAS::GLOBAL_ADDRESS;
-}
-
-std::pair<const Value *, unsigned>
-AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
-  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::amdgcn_is_shared:
-      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
-    case Intrinsic::amdgcn_is_private:
-      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
-    default:
-      break;
-    }
-    return std::pair(nullptr, -1);
-  }
-  // Check the global pointer predication based on
-  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and
-  // the order of 'is_shared' and 'is_private' is not significant.
-  Value *Ptr;
-  if (match(
-          const_cast<Value *>(V),
-          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
-                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
-                      m_Deferred(Ptr))))))
-    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
-
-  return std::pair(nullptr, -1);
-}
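
Illustrative only: the predicate shape this hook matches can be produced with IRBuilder as sketched below. emitNotSharedNotPrivate is a made-up helper name, not an API introduced by this patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

Value *emitNotSharedNotPrivate(IRBuilder<> &B, Value *FlatPtr) {
  Value *IsShared =
      B.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, {FlatPtr});
  Value *IsPrivate =
      B.CreateIntrinsic(Intrinsic::amdgcn_is_private, {}, {FlatPtr});
  // This is the m_c_And(m_Not(is_shared), m_Not(is_private)) shape matched
  // by getPredicatedAddrSpace() above.
  return B.CreateAnd(B.CreateNot(IsShared), B.CreateNot(IsPrivate));
}
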
-
-unsigned
-AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
-  switch (Kind) {
-  case PseudoSourceValue::Stack:
-  case PseudoSourceValue::FixedStack:
-    return AMDGPUAS::PRIVATE_ADDRESS;
-  case PseudoSourceValue::ConstantPool:
-  case PseudoSourceValue::GOT:
-  case PseudoSourceValue::JumpTable:
-  case PseudoSourceValue::GlobalValueCallEntry:
-  case PseudoSourceValue::ExternalSymbolCallEntry:
-    return AMDGPUAS::CONSTANT_ADDRESS;
-  }
-  return AMDGPUAS::FLAT_ADDRESS;
-}
-
-bool AMDGPUTargetMachine::splitModule(
-    Module &M, unsigned NumParts,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
-  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
-  // but all current users of this API don't have one ready and would need to
-  // create one anyway. Let's hide the boilerplate for now to keep it simple.
-
-  LoopAnalysisManager LAM;
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
-
-  PassBuilder PB(this);
-  PB.registerModuleAnalyses(MAM);
-  PB.registerFunctionAnalyses(FAM);
-  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
-  ModulePassManager MPM;
-  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
-  MPM.run(M, MAM);
-  return true;
-}
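
A hedged usage sketch for the hook above; splitIntoFour and the part count are illustrative, and splitModule is called through the TargetMachine interface it overrides:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

SmallVector<std::unique_ptr<Module>, 4> splitIntoFour(TargetMachine &TM,
                                                      Module &M) {
  SmallVector<std::unique_ptr<Module>, 4> Parts;
  TM.splitModule(M, /*NumParts=*/4, [&](std::unique_ptr<Module> MPart) {
    Parts.push_back(std::move(MPart)); // one callback per partition
  });
  return Parts;
}
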
-
-//===----------------------------------------------------------------------===//
-// GCN Target Machine (SI+)
-//===----------------------------------------------------------------------===//
-
-GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
-                                   StringRef CPU, StringRef FS,
-                                   const TargetOptions &Options,
-                                   std::optional<Reloc::Model> RM,
-                                   std::optional<CodeModel::Model> CM,
-                                   CodeGenOptLevel OL, bool JIT)
-    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-
-const TargetSubtargetInfo *
-GCNTargetMachine::getSubtargetImpl(const Function &F) const {
-  StringRef GPU = getGPUName(F);
-  StringRef FS = getFeatureString(F);
-
-  SmallString<128> SubtargetKey(GPU);
-  SubtargetKey.append(FS);
-
-  auto &I = SubtargetMap[SubtargetKey];
-  if (!I) {
-    // This needs to be done before we create a new subtarget since any
-    // creation will depend on the TM and the code generation flags on the
-    // function that reside in TargetOptions.
-    resetTargetOptions(F);
-    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
-  }
-
-  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
-
-  return I.get();
-}
-
-TargetTransformInfo
-GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
-  return TargetTransformInfo(GCNTTIImpl(this, F));
-}
-
-Error GCNTargetMachine::buildCodeGenPipeline(
-    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
-    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
-    PassInstrumentationCallbacks *PIC) {
-  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
-  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
-}
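
A hedged sketch (not part of the patch) of constructing the new-PM codegen pipeline through this override; actually running the resulting ModulePassManager still needs the analysis-manager and MachineModuleInfo setup that llc's new-PM driver performs:

#include "llvm/CodeGen/CGPassBuilderOption.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

Error buildAsmPipeline(LLVMTargetMachine &TM, ModulePassManager &MPM,
                       raw_pwrite_stream &OS) {
  CGPassBuilderOption Opts = getCGPassBuilderOption();
  return TM.buildCodeGenPipeline(MPM, OS, /*DwoOut=*/nullptr,
                                 CodeGenFileType::AssemblyFile, Opts,
                                 /*PIC=*/nullptr);
}
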
-
-//===----------------------------------------------------------------------===//
-// AMDGPU Pass Setup
-//===----------------------------------------------------------------------===//
-
-std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
-  return getStandardCSEConfigForOpt(TM->getOptLevel());
-}
-
-namespace {
-
-class GCNPassConfig final : public AMDGPUPassConfig {
-public:
-  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
-    : AMDGPUPassConfig(TM, PM) {
-    // It is necessary to know the register usage of the entire call graph.  We
-    // allow calls without EnableAMDGPUFunctionCalls if they are marked
-    // noinline, so this is always required.
-    setRequiresCodeGenSCCOrder(true);
-    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
-  }
-
-  GCNTargetMachine &getGCNTargetMachine() const {
-    return getTM<GCNTargetMachine>();
-  }
-
-  ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const override;
-
-  ScheduleDAGInstrs *
-  createPostMachineScheduler(MachineSchedContext *C) const override {
-    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
-        C, std::make_unique<PostGenericScheduler>(C),
-        /*RemoveKillFlags=*/true);
-    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-    if (ST.shouldClusterStores())
-      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
-    DAG->addMutation(
-        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
-    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
-      DAG->addMutation(createVOPDPairingMutation());
-    return DAG;
-  }
-
-  bool addPreISel() override;
-  void addMachineSSAOptimization() override;
-  bool addILPOpts() override;
-  bool addInstSelector() override;
-  bool addIRTranslator() override;
-  void addPreLegalizeMachineIR() override;
-  bool addLegalizeMachineIR() override;
-  void addPreRegBankSelect() override;
-  bool addRegBankSelect() override;
-  void addPreGlobalInstructionSelect() override;
-  bool addGlobalInstructionSelect() override;
-  void addFastRegAlloc() override;
-  void addOptimizedRegAlloc() override;
-
-  FunctionPass *createSGPRAllocPass(bool Optimized);
-  FunctionPass *createVGPRAllocPass(bool Optimized);
-  FunctionPass *createRegAllocPass(bool Optimized) override;
-
-  bool addRegAssignAndRewriteFast() override;
-  bool addRegAssignAndRewriteOptimized() override;
-
-  void addPreRegAlloc() override;
-  bool addPreRewrite() override;
-  void addPostRegAlloc() override;
-  void addPreSched2() override;
-  void addPreEmitPass() override;
-};
-
-} // end anonymous namespace
-
-AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {
-  // Exceptions and StackMaps are not supported, so these passes will never do
-  // anything.
-  disablePass(&StackMapLivenessID);
-  disablePass(&FuncletLayoutID);
-  // Garbage collection is not supported.
-  disablePass(&GCLoweringID);
-  disablePass(&ShadowStackGCLoweringID);
-}
-
-void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
-  if (getOptLevel() == CodeGenOptLevel::Aggressive)
-    addPass(createGVNPass());
-  else
-    addPass(createEarlyCSEPass());
-}
-
-void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
-  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
-    addPass(createLoopDataPrefetchPass());
-  addPass(createSeparateConstOffsetFromGEPPass());
-  // ReassociateGEPs exposes more opportunities for SLSR. See
-  // the example in reassociate-geps-and-slsr.ll.
-  addPass(createStraightLineStrengthReducePass());
-  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
-  // EarlyCSE can reuse.
-  addEarlyCSEOrGVNPass();
-  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
-  addPass(createNaryReassociatePass());
-  // NaryReassociate on GEPs creates redundant common expressions, so run
-  // EarlyCSE after it.
-  addPass(createEarlyCSEPass());
-}
-
-void AMDGPUPassConfig::addIRPasses() {
-  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-
-  Triple::ArchType Arch = TM.getTargetTriple().getArch();
-  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
-    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
-
-  // There is no reason to run these.
-  disablePass(&StackMapLivenessID);
-  disablePass(&FuncletLayoutID);
-  disablePass(&PatchableFunctionID);
-
-  addPass(createAMDGPUPrintfRuntimeBinding());
-  if (LowerCtorDtor)
-    addPass(createAMDGPUCtorDtorLoweringLegacyPass());
-
-  if (isPassEnabled(EnableImageIntrinsicOptimizer))
-    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
-
-  // This can be disabled by passing ::Disable here or on the command line
-  // with --expand-variadics-override=disable.
-  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
-
-  // Function calls are not supported, so make sure we inline everything.
-  addPass(createAMDGPUAlwaysInlinePass());
-  addPass(createAlwaysInlinerLegacyPass());
-
-  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
-  if (Arch == Triple::r600)
-    addPass(createR600OpenCLImageTypeLoweringPass());
-
-  // Replace OpenCL enqueued block function pointers with global variables.
-  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
-
-  // Runs before PromoteAlloca so the latter can account for function uses
-  if (EnableLowerModuleLDS) {
-    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
-  }
-
-  if (TM.getOptLevel() > CodeGenOptLevel::None)
-    addPass(createInferAddressSpacesPass());
-
-  // Run atomic optimizer before Atomic Expand
-  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
-      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
-      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
-    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
-  }
-
-  addPass(createAtomicExpandLegacyPass());
-
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    addPass(createAMDGPUPromoteAlloca());
-
-    if (isPassEnabled(EnableScalarIRPasses))
-      addStraightLineScalarOptimizationPasses();
-
-    if (EnableAMDGPUAliasAnalysis) {
-      addPass(createAMDGPUAAWrapperPass());
-      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
-                                             AAResults &AAR) {
-        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
-          AAR.addAAResult(WrapperPass->getResult());
-        }));
-    }
-
-    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
-      // TODO: May want to move later or split into an early and late one.
-      addPass(createAMDGPUCodeGenPreparePass());
-    }
-
-    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
-    // have expanded.
-    if (TM.getOptLevel() > CodeGenOptLevel::Less)
-      addPass(createLICMPass());
-  }
-
-  TargetPassConfig::addIRPasses();
-
-  // EarlyCSE is not always strong enough to clean up what LSR produces. For
-  // example, GVN can combine
-  //
-  //   %0 = add %a, %b
-  //   %1 = add %b, %a
-  //
-  // and
-  //
-  //   %0 = shl nsw %a, 2
-  //   %1 = shl %a, 2
-  //
-  // but EarlyCSE can do neither of them.
-  if (isPassEnabled(EnableScalarIRPasses))
-    addEarlyCSEOrGVNPass();
-}
-
-void AMDGPUPassConfig::addCodeGenPrepare() {
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
-    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
-    // analysis, and should be removed.
-    addPass(createAMDGPUAnnotateKernelFeaturesPass());
-  }
-
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
-      EnableLowerKernelArguments)
-    addPass(createAMDGPULowerKernelArgumentsPass());
-
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
-    // This lowering has been placed after codegenprepare to take advantage of
-    // address mode matching (which is why it isn't put with the LDS lowerings).
-    // It could be placed anywhere before uniformity annotations (an analysis
-    // that it changes by splitting up fat pointers into their components)
-    // but has been put before switch lowering and CFG flattening so that those
-    // passes can run on the more optimized control flow this pass creates in
-    // many cases.
-    //
-    // FIXME: This should ideally be put after the LoadStoreVectorizer.
-    // However, due to some annoying facts about ResourceUsageAnalysis,
-    // (especially as exercised in the resource-usage-dead-function test),
-    // we need all the function passes from codegenprepare all the way through
-    // said resource usage analysis to run on the call graph produced
-    // before codegenprepare runs (because codegenprepare will knock some
-    // nodes out of the graph, which leads to function-level passes not
-    // being run on them, which causes crashes in the resource usage analysis).
-    addPass(createAMDGPULowerBufferFatPointersPass());
-    // In accordance with the above FIXME, manually force all the
-    // function-level passes into a CGSCCPassManager.
-    addPass(new DummyCGSCCPass());
-  }
-
-  TargetPassConfig::addCodeGenPrepare();
-
-  if (isPassEnabled(EnableLoadStoreVectorizer))
-    addPass(createLoadStoreVectorizerPass());
-
-  // LowerSwitch pass may introduce unreachable blocks that can
-  // cause unexpected behavior for subsequent passes. Placing it
-  // here ensures these blocks get cleaned up by
-  // UnreachableBlockElim, which is inserted next in the pass flow.
-  addPass(createLowerSwitchPass());
-}
-
-bool AMDGPUPassConfig::addPreISel() {
-  if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createFlattenCFGPass());
-  return false;
-}
-
-bool AMDGPUPassConfig::addInstSelector() {
-  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
-  return false;
-}
-
-bool AMDGPUPassConfig::addGCPasses() {
-  // Do nothing. GC is not supported.
-  return false;
-}
-
-llvm::ScheduleDAGInstrs *
-AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
-  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  if (ST.shouldClusterStores())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  return DAG;
-}
-
-MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
-    BumpPtrAllocator &Allocator, const Function &F,
-    const TargetSubtargetInfo *STI) const {
-  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
-      Allocator, F, static_cast<const R600Subtarget *>(STI));
-}
-
-//===----------------------------------------------------------------------===//
-// GCN Pass Setup
-//===----------------------------------------------------------------------===//
-
-ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
-  MachineSchedContext *C) const {
-  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  if (ST.enableSIScheduler())
-    return createSIMachineScheduler(C);
-
-  if (EnableMaxIlpSchedStrategy)
-    return createGCNMaxILPMachineScheduler(C);
-
-  return createGCNMaxOccupancyMachineScheduler(C);
-}
-
-bool GCNPassConfig::addPreISel() {
-  AMDGPUPassConfig::addPreISel();
-
-  if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSinkingPass());
-
-  if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPULateCodeGenPrepareLegacyPass());
-
-  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
-  // regions formed by them.
-  addPass(&AMDGPUUnifyDivergentExitNodesID);
-  if (!LateCFGStructurize && !DisableStructurizer) {
-    if (EnableStructurizerWorkarounds) {
-      addPass(createFixIrreduciblePass());
-      addPass(createUnifyLoopExitsPass());
-    }
-    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
-  }
-  addPass(createAMDGPUAnnotateUniformValuesLegacy());
-  if (!LateCFGStructurize && !DisableStructurizer) {
-    addPass(createSIAnnotateControlFlowLegacyPass());
-    // TODO: Move this right after structurizeCFG to avoid extra divergence
-    // analysis. This depends on stopping SIAnnotateControlFlow from making
-    // control flow modifications.
-    addPass(createAMDGPURewriteUndefForPHILegacyPass());
-  }
-  addPass(createLCSSAPass());
-
-  if (TM->getOptLevel() > CodeGenOptLevel::Less)
-    addPass(&AMDGPUPerfHintAnalysisLegacyID);
-
-  return false;
-}
-
-void GCNPassConfig::addMachineSSAOptimization() {
-  TargetPassConfig::addMachineSSAOptimization();
-
-  // We want to fold operands after PeepholeOptimizer has run (or as part of
-  // it), because it will eliminate extra copies making it easier to fold the
-  // real source operand. We want to eliminate dead instructions after, so that
-  // we see fewer uses of the copies. We then need to clean up the dead
-  // instructions leftover after the operands are folded as well.
-  //
-  // XXX - Can we get away without running DeadMachineInstructionElim again?
-  addPass(&SIFoldOperandsID);
-  if (EnableDPPCombine)
-    addPass(&GCNDPPCombineID);
-  addPass(&SILoadStoreOptimizerID);
-  if (isPassEnabled(EnableSDWAPeephole)) {
-    addPass(&SIPeepholeSDWAID);
-    addPass(&EarlyMachineLICMID);
-    addPass(&MachineCSEID);
-    addPass(&SIFoldOperandsID);
-  }
-  addPass(&DeadMachineInstructionElimID);
-  addPass(createSIShrinkInstructionsPass());
-}
-
-bool GCNPassConfig::addILPOpts() {
-  if (EnableEarlyIfConversion)
-    addPass(&EarlyIfConverterID);
-
-  TargetPassConfig::addILPOpts();
-  return false;
-}
-
-bool GCNPassConfig::addInstSelector() {
-  AMDGPUPassConfig::addInstSelector();
-  addPass(&SIFixSGPRCopiesLegacyID);
-  addPass(createSILowerI1CopiesLegacyPass());
-  return false;
-}
-
-bool GCNPassConfig::addIRTranslator() {
-  addPass(new IRTranslator(getOptLevel()));
-  return false;
-}
-
-void GCNPassConfig::addPreLegalizeMachineIR() {
-  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
-  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
-  addPass(new Localizer());
-}
-
-bool GCNPassConfig::addLegalizeMachineIR() {
-  addPass(new Legalizer());
-  return false;
-}
-
-void GCNPassConfig::addPreRegBankSelect() {
-  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
-  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
-  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
-}
-
-bool GCNPassConfig::addRegBankSelect() {
-  addPass(new AMDGPURegBankSelect());
-  return false;
-}
-
-void GCNPassConfig::addPreGlobalInstructionSelect() {
-  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
-  addPass(createAMDGPURegBankCombiner(IsOptNone));
-}
-
-bool GCNPassConfig::addGlobalInstructionSelect() {
-  addPass(new InstructionSelect(getOptLevel()));
-  return false;
-}
-
-void GCNPassConfig::addPreRegAlloc() {
-  if (LateCFGStructurize) {
-    addPass(createAMDGPUMachineCFGStructurizerPass());
-  }
-}
-
-void GCNPassConfig::addFastRegAlloc() {
-  // FIXME: We have to disable the verifier here because of PHIElimination +
-  // TwoAddressInstructions disabling it.
-
-  // This must be run immediately after phi elimination and before
-  // TwoAddressInstructions, otherwise the processing of the tied operand of
-  // SI_ELSE will introduce a copy of the tied operand source after the else.
-  insertPass(&PHIEliminationID, &SILowerControlFlowID);
-
-  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
-
-  TargetPassConfig::addFastRegAlloc();
-}
-
-void GCNPassConfig::addOptimizedRegAlloc() {
-  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
-  // instructions that cause scheduling barriers.
-  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
-
-  if (OptExecMaskPreRA)
-    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
-
-  if (EnableRewritePartialRegUses)
-    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
-
-  if (isPassEnabled(EnablePreRAOptimizations))
-    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
-
-  // This is not an essential optimization and it has a noticeable impact on
-  // compilation time, so we only enable it from O2.
-  if (TM->getOptLevel() > CodeGenOptLevel::Less)
-    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
-
-  // FIXME: when an instruction has a killed operand and the instruction is
-  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
-  // of the register in LiveVariables. This triggers a verifier failure, so we
-  // should fix it and enable the verifier.
-  if (OptVGPRLiveRange)
-    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
-  // This must be run immediately after phi elimination and before
-  // TwoAddressInstructions, otherwise the processing of the tied operand of
-  // SI_ELSE will introduce a copy of the tied operand source after the else.
-  insertPass(&PHIEliminationID, &SILowerControlFlowID);
-
-  if (EnableDCEInRA)
-    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
-
-  TargetPassConfig::addOptimizedRegAlloc();
-}
-
-bool GCNPassConfig::addPreRewrite() {
-  addPass(&SILowerWWMCopiesID);
-  if (EnableRegReassign)
-    addPass(&GCNNSAReassignID);
-  return true;
-}
-
-FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
-  // Initialize the global default.
-  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
-                  initializeDefaultSGPRRegisterAllocatorOnce);
-
-  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
-  if (Ctor != useDefaultRegisterAllocator)
-    return Ctor();
-
-  if (Optimized)
-    return createGreedyRegisterAllocator(onlyAllocateSGPRs);
-
-  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
-}
-
-FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
-  // Initialize the global default.
-  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
-                  initializeDefaultVGPRRegisterAllocatorOnce);
-
-  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
-  if (Ctor != useDefaultRegisterAllocator)
-    return Ctor();
-
-  if (Optimized)
-    return createGreedyVGPRRegisterAllocator();
-
-  return createFastVGPRRegisterAllocator();
-}
-
-FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
-  llvm_unreachable("should not be used");
-}
-
-static const char RegAllocOptNotSupportedMessage[] =
-  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
-
-bool GCNPassConfig::addRegAssignAndRewriteFast() {
-  if (!usingDefaultRegAlloc())
-    report_fatal_error(RegAllocOptNotSupportedMessage);
-
-  addPass(&GCNPreRALongBranchRegID);
-
-  addPass(createSGPRAllocPass(false));
-
-  // Equivalent of PEI for SGPRs.
-  addPass(&SILowerSGPRSpillsID);
-  addPass(&SIPreAllocateWWMRegsID);
-
-  addPass(createVGPRAllocPass(false));
-
-  addPass(&SILowerWWMCopiesID);
-  return true;
-}
-
-bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
-  if (!usingDefaultRegAlloc())
-    report_fatal_error(RegAllocOptNotSupportedMessage);
-
-  addPass(&GCNPreRALongBranchRegID);
-
-  addPass(createSGPRAllocPass(true));
-
-  // Commit allocated register changes. This is mostly necessary because too
-  // many things rely on the use lists of the physical registers, such as the
-  // verifier. This is only necessary with allocators which use LiveIntervals,
-  // since FastRegAlloc does the replacements itself.
-  addPass(createVirtRegRewriter(false));
-
-  // Equivalent of PEI for SGPRs.
-  addPass(&SILowerSGPRSpillsID);
-  addPass(&SIPreAllocateWWMRegsID);
-
-  addPass(createVGPRAllocPass(true));
-
-  addPreRewrite();
-  addPass(&VirtRegRewriterID);
-
-  addPass(&AMDGPUMarkLastScratchLoadID);
-
-  return true;
-}
-
-void GCNPassConfig::addPostRegAlloc() {
-  addPass(&SIFixVGPRCopiesID);
-  if (getOptLevel() > CodeGenOptLevel::None)
-    addPass(&SIOptimizeExecMaskingID);
-  TargetPassConfig::addPostRegAlloc();
-}
-
-void GCNPassConfig::addPreSched2() {
-  if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSIShrinkInstructionsPass());
-  addPass(&SIPostRABundlerID);
-}
-
-void GCNPassConfig::addPreEmitPass() {
-  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
-    addPass(&GCNCreateVOPDID);
-  addPass(createSIMemoryLegalizerPass());
-  addPass(createSIInsertWaitcntsPass());
-
-  addPass(createSIModeRegisterPass());
-
-  if (getOptLevel() > CodeGenOptLevel::None)
-    addPass(&SIInsertHardClausesID);
-
-  addPass(&SILateBranchLoweringPassID);
-  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
-    addPass(createAMDGPUSetWavePriorityPass());
-  if (getOptLevel() > CodeGenOptLevel::None)
-    addPass(&SIPreEmitPeepholeID);
-  // The hazard recognizer that runs as part of the post-ra scheduler does not
-  // guarantee that it can handle all hazards correctly. This is because if there
-  // are multiple scheduling regions in a basic block, the regions are scheduled
-  // bottom up, so when we begin to schedule a region we don't know what
-  // instructions were emitted directly before it.
-  //
-  // Here we add a stand-alone hazard recognizer pass which can handle all
-  // cases.
-  addPass(&PostRAHazardRecognizerID);
-
-  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
-    addPass(&AMDGPUInsertSingleUseVDSTID);
-
-  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
-    addPass(&AMDGPUInsertDelayAluID);
-
-  addPass(&BranchRelaxationPassID);
-}
-
-TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new GCNPassConfig(*this, PM);
-}
-
-void GCNTargetMachine::registerMachineRegisterInfoCallback(
-    MachineFunction &MF) const {
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  MF.getRegInfo().addDelegate(MFI);
-}
-
-MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
-    BumpPtrAllocator &Allocator, const Function &F,
-    const TargetSubtargetInfo *STI) const {
-  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
-      Allocator, F, static_cast<const GCNSubtarget *>(STI));
-}
-
-yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
-  return new yaml::SIMachineFunctionInfo();
-}
-
-yaml::MachineFunctionInfo *
-GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  return new yaml::SIMachineFunctionInfo(
-      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
-}
-
-bool GCNTargetMachine::parseMachineFunctionInfo(
-    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
-    SMDiagnostic &Error, SMRange &SourceRange) const {
-  const yaml::SIMachineFunctionInfo &YamlMFI =
-      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
-  MachineFunction &MF = PFS.MF;
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
-  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
-    return true;
-
-  if (MFI->Occupancy == 0) {
-    // Fixup the subtarget dependent default value.
-    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
-  }
-
-  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
-    Register TempReg;
-    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
-      SourceRange = RegName.SourceRange;
-      return true;
-    }
-    RegVal = TempReg;
-
-    return false;
-  };
-
-  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
-                                   Register &RegVal) {
-    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
-  };
-
-  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
-    return true;
-
-  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
-    return true;
-
-  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
-                            MFI->LongBranchReservedReg))
-    return true;
-
-  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
-    // Create a diagnostic for the register string literal.
-    const MemoryBuffer &Buffer =
-        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
-    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
-                         RegName.Value.size(), SourceMgr::DK_Error,
-                         "incorrect register class for field", RegName.Value,
-                         std::nullopt, std::nullopt);
-    SourceRange = RegName.SourceRange;
-    return true;
-  };
-
-  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
-      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
-      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
-    return true;
-
-  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
-      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
-    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
-  }
-
-  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
-      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
-    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
-  }
-
-  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
-      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
-    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
-  }
-
-  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
-    Register ParsedReg;
-    if (parseRegister(YamlReg, ParsedReg))
-      return true;
-
-    MFI->reserveWWMRegister(ParsedReg);
-  }
-
-  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
-                                   const TargetRegisterClass &RC,
-                                   ArgDescriptor &Arg, unsigned UserSGPRs,
-                                   unsigned SystemSGPRs) {
-    // Skip parsing if it's not present.
-    if (!A)
-      return false;
-
-    if (A->IsRegister) {
-      Register Reg;
-      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
-        SourceRange = A->RegisterName.SourceRange;
-        return true;
-      }
-      if (!RC.contains(Reg))
-        return diagnoseRegisterClass(A->RegisterName);
-      Arg = ArgDescriptor::createRegister(Reg);
-    } else
-      Arg = ArgDescriptor::createStack(A->StackOffset);
-    // Check and apply the optional mask.
-    if (A->Mask)
-      Arg = ArgDescriptor::createArg(Arg, *A->Mask);
-
-    MFI->NumUserSGPRs += UserSGPRs;
-    MFI->NumSystemSGPRs += SystemSGPRs;
-    return false;
-  };
-
-  if (YamlMFI.ArgInfo &&
-      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
-                             AMDGPU::SGPR_128RegClass,
-                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
-                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
-                             2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
-                             MFI->ArgInfo.QueuePtr, 2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
-                             AMDGPU::SReg_64RegClass,
-                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
-                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
-                             2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
-                             AMDGPU::SReg_64RegClass,
-                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
-                             AMDGPU::SGPR_32RegClass,
-                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
-                             AMDGPU::SGPR_32RegClass,
-                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
-                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
-                             0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
-                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
-                             0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
-                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
-                             0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
-                             AMDGPU::SGPR_32RegClass,
-                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
-                             AMDGPU::SGPR_32RegClass,
-                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
-                             AMDGPU::SReg_64RegClass,
-                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
-                             AMDGPU::SReg_64RegClass,
-                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
-       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
-    return true;
-
-  if (ST.hasIEEEMode())
-    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
-  if (ST.hasDX10ClampMode())
-    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
-
-  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
-  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
-                                      ? DenormalMode::IEEE
-                                      : DenormalMode::PreserveSign;
-  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
-                                       ? DenormalMode::IEEE
-                                       : DenormalMode::PreserveSign;
-
-  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
-                                          ? DenormalMode::IEEE
-                                          : DenormalMode::PreserveSign;
-  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
-                                           ? DenormalMode::IEEE
-                                           : DenormalMode::PreserveSign;
-
-  return false;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
deleted file mode 100644
index 6bb8788cc73b0c..00000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ /dev/null
@@ -1,162 +0,0 @@
-//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// The AMDGPU TargetMachine interface definition for hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-
-#include "GCNSubtarget.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Target/TargetMachine.h"
-#include <optional>
-#include <utility>
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// AMDGPU Target Machine (R600+)
-//===----------------------------------------------------------------------===//
-
-class AMDGPUTargetMachine : public LLVMTargetMachine {
-protected:
-  std::unique_ptr<TargetLoweringObjectFile> TLOF;
-
-  StringRef getGPUName(const Function &F) const;
-  StringRef getFeatureString(const Function &F) const;
-
-public:
-  static bool EnableLateStructurizeCFG;
-  static bool EnableFunctionCalls;
-  static bool EnableLowerModuleLDS;
-  static bool DisableStructurizer;
-
-  AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                      StringRef FS, const TargetOptions &Options,
-                      std::optional<Reloc::Model> RM,
-                      std::optional<CodeModel::Model> CM, CodeGenOptLevel OL);
-  ~AMDGPUTargetMachine() override;
-
-  const TargetSubtargetInfo *getSubtargetImpl() const;
-  const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0;
-
-  TargetLoweringObjectFile *getObjFileLowering() const override {
-    return TLOF.get();
-  }
-
-  void registerPassBuilderCallbacks(PassBuilder &PB) override;
-  void registerDefaultAliasAnalyses(AAManager &) override;
-
-  /// Get the integer value of a null pointer in the given address space.
-  static int64_t getNullPointerValue(unsigned AddrSpace);
-
-  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
-
-  unsigned getAssumedAddrSpace(const Value *V) const override;
-
-  std::pair<const Value *, unsigned>
-  getPredicatedAddrSpace(const Value *V) const override;
-
-  unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
-
-  bool splitModule(Module &M, unsigned NumParts,
-                   function_ref<void(std::unique_ptr<Module> MPart)>
-                       ModuleCallback) override;
-};
-
-//===----------------------------------------------------------------------===//
-// GCN Target Machine (SI+)
-//===----------------------------------------------------------------------===//
-
-class GCNTargetMachine final : public AMDGPUTargetMachine {
-private:
-  mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
-
-public:
-  GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                   StringRef FS, const TargetOptions &Options,
-                   std::optional<Reloc::Model> RM,
-                   std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
-                   bool JIT);
-
-  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-
-  const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
-
-  TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
-
-  bool useIPRA() const override {
-    return true;
-  }
-
-  Error buildCodeGenPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out,
-                             raw_pwrite_stream *DwoOut,
-                             CodeGenFileType FileType,
-                             const CGPassBuilderOption &Opts,
-                             PassInstrumentationCallbacks *PIC) override;
-
-  void registerMachineRegisterInfoCallback(MachineFunction &MF) const override;
-
-  MachineFunctionInfo *
-  createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
-                            const TargetSubtargetInfo *STI) const override;
-
-  yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
-  yaml::MachineFunctionInfo *
-  convertFuncInfoToYAML(const MachineFunction &MF) const override;
-  bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
-                                PerFunctionMIParsingState &PFS,
-                                SMDiagnostic &Error,
-                                SMRange &SourceRange) const override;
-};
-
-//===----------------------------------------------------------------------===//
-// AMDGPU Pass Setup
-//===----------------------------------------------------------------------===//
-
-class AMDGPUPassConfig : public TargetPassConfig {
-public:
-  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM);
-
-  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
-    return getTM<AMDGPUTargetMachine>();
-  }
-
-  ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const override;
-
-  void addEarlyCSEOrGVNPass();
-  void addStraightLineScalarOptimizationPasses();
-  void addIRPasses() override;
-  void addCodeGenPrepare() override;
-  bool addPreISel() override;
-  bool addInstSelector() override;
-  bool addGCPasses() override;
-
-  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
-
-  /// Check if a pass is enabled given the \p Opt option. If the option is
-  /// explicitly used, it always overrides the default. Otherwise, the option's
-  /// default value is used, but only when compiling at optimization level
-  /// \p Level or higher; below that level the pass is disabled.
-  bool isPassEnabled(const cl::opt<bool> &Opt,
-                     CodeGenOptLevel Level = CodeGenOptLevel::Default) const {
-    if (Opt.getNumOccurrences())
-      return Opt;
-    if (TM->getOptLevel() < Level)
-      return false;
-    return Opt;
-  }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c38c2dc0f5f618..165bdb791afb35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -15,7 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIModeRegisterDefaults.h"
 #include "llvm/Analysis/InlineCost.h"
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 671caf8484cd97..e21b2cf62f0c8f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -101,7 +101,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUSetWavePriority.cpp
   AMDGPUSplitModule.cpp
   AMDGPUSubtarget.cpp
-  AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
   AMDGPUTargetTransformInfo.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index c550cfaf06c100..fe1927e5bdf74a 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "R600TargetMachine.h"
-#include "AMDGPUTargetMachine.h"
 #include "R600.h"
 #include "R600CodeGenPassBuilder.h"
 #include "R600MachineScheduler.h"
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
index 29e370edef2c67..8359f2faa2dba6 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H
 #define LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H
 
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "R600Subtarget.h"
 #include "llvm/Target/TargetMachine.h"
 #include <optional>
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index ad4aaa8fdef84c..f8e0e991c2c6ef 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -16,7 +16,7 @@
 
 #include "R600TargetTransformInfo.h"
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "R600Subtarget.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index edd881c84078c6..f765c06bda6d14 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "GCNSubtarget.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86fc100f1c2da0..7f2e7b8d2ca9a7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13,8 +13,8 @@
 
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c5251826b117cb..4818f1b13caa6a 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,7 +8,6 @@
 
 #include "SIMachineFunctionInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIRegisterInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7af5e7388f841e..bf423e70dc8265 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -14,8 +14,8 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
 
 #include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUMachineFunction.h"
-#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index d0a3cfa84ee018..98c8f94a8a7c96 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUUnitTests.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "GCNSubtarget.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
index 56da4ce7b43af0..b6ce354af8a4bf 100644
--- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUUnitTests.h"
 #include "gtest/gtest.h"
 
diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index 5ac4edae5f0dfa..6efbdc9eae9aa0 100644
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUUnitTests.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "gtest/gtest.h"
diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
index 86aa4753a21237..d84587cc7a4429 100644
--- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
+++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "GCNSubtarget.h"
 #include "SIProgramInfo.h"
 #include "Utils/AMDGPUPALMetadata.h"


