[llvm-branch-commits] [llvm] NewPM/AMDGPU: Port AMDGPUPerfHintAnalysis to new pass manager (PR #102645)

Fri Aug 9 09:49:05 PDT 2024

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/102645

This was much more difficult than I anticipated. The pass is
not in a good state, with poor test coverage. The legacy PM
does seem to be relying on maintaining the map state between
different SCCs, which seems bad. The pass is going out of its
way to avoid putting the attributes it introduces onto non-callee
functions. If it just added them, we could use them directly
instead of relying on the map, I would think.

The NewPM path uses a ModulePass; I'm not sure if we should be
using CGSCC here but there seems to be some missing infrastructure
to support backend defined ones.

>From 353885ddcab1182b25915766b9235ba411386359 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 9 Aug 2024 17:27:53 +0400
Subject: [PATCH] NewPM/AMDGPU: Port AMDGPUPerfHintAnalysis to new pass manager

This was much more difficult than I anticipated. The pass is
not in a good state, with poor test coverage. The legacy PM
does seem to be relying on maintaining the map state between
different SCCs, which seems bad. The pass is going out of its
way to avoid putting the attributes it introduces onto non-callee
functions. If it just added them, we could use them directly
instead of relying on the map, I would think.

The NewPM path uses a ModulePass; I'm not sure if we should be
using CGSCC here but there seems to be some missing infrastructure
to support backend defined ones.
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   2 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp  | 113 ++++++++++++++----
 .../Target/AMDGPU/AMDGPUPerfHintAnalysis.h    |  62 ++++++----
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 +-
 llvm/test/CodeGen/AMDGPU/perfhint.ll          |   1 +
 7 files changed, 136 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 195e2a19214e80..5b8d37a8ae7944 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -209,8 +209,8 @@ extern char &SIPreAllocateWWMRegsID;
 void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &);
 extern char &AMDGPUImageIntrinsicOptimizerID;
 
-void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
-extern char &AMDGPUPerfHintAnalysisID;
+void initializeAMDGPUPerfHintAnalysisLegacyPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisLegacyID;
 
 void initializeGCNRegPressurePrinterPass(PassRegistry &);
 extern char &GCNRegPressurePrinterID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8579774f522309..bbb4573655ab79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -102,7 +102,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
-INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 #ifdef EXPENSIVE_CHECKS
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6a6c33d85f83c..23fb1dba7b084c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
             AMDGPULowerBufferFatPointersPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass(*static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 1213d5e0b41db1..5797f02cb374e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -12,12 +12,15 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPU.h"
 #include "AMDGPUPerfHintAnalysis.h"
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -54,12 +57,6 @@ static cl::opt<unsigned>
 STATISTIC(NumMemBound, "Number of functions marked as memory bound");
 STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
 
-char llvm::AMDGPUPerfHintAnalysis::ID = 0;
-char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
-
-INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
-                "Analysis if a function is memory bound", true, true)
-
 namespace {
 
 struct AMDGPUPerfHint {
@@ -67,7 +64,7 @@ struct AMDGPUPerfHint {
 
 public:
   AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
-                 const TargetLowering *TLI_)
+                 const SITargetLowering *TLI_)
       : FIM(FIM_), TLI(TLI_) {}
 
   bool runOnFunction(Function &F);
@@ -97,7 +94,7 @@ struct AMDGPUPerfHint {
 
   const DataLayout *DL = nullptr;
 
-  const TargetLowering *TLI;
+  const SITargetLowering *TLI;
 
   AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
   static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
@@ -388,23 +385,52 @@ bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
                << Reference.print() << "Result:" << Result << '\n');
   return Result;
 }
+
+class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {
+private:
+  // FIXME: This is relying on maintaining state between different SCCs.
+  AMDGPUPerfHintAnalysis Impl;
+
+public:
+  static char ID;
+
+  AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(ID) {}
+
+  bool runOnSCC(CallGraphSCC &SCC) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+};
+
 } // namespace
 
-bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+  auto FI = FIM.find(F);
+  if (FI == FIM.end())
     return false;
 
-  const TargetMachine &TM = TPC->getTM<TargetMachine>();
+  return AMDGPUPerfHint::isMemBound(FI->second);
+}
+
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+  auto FI = FIM.find(F);
+  if (FI == FIM.end())
+    return false;
+
+  return AMDGPUPerfHint::needLimitWave(FI->second);
+}
 
+bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
+                                      CallGraphSCC &SCC) {
   bool Changed = false;
   for (CallGraphNode *I : SCC) {
     Function *F = I->getFunction();
     if (!F || F->isDeclaration())
       continue;
 
-    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
-    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
 
     if (Analyzer.runOnFunction(*F))
       Changed = true;
@@ -413,18 +439,57 @@ bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
   return Changed;
 }
 
-bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
-  auto FI = FIM.find(F);
-  if (FI == FIM.end())
-    return false;
+bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
+                                 LazyCallGraph &CG) {
 
-  return AMDGPUPerfHint::isMemBound(FI->second);
+  SmallVector<Function *, 16> Worklist;
+  CG.buildRefSCCs();
+  for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
+    for (LazyCallGraph::SCC &SCC : RC) {
+      if (SCC.size() != 1)
+        continue;
+      Function &F = SCC.begin()->getFunction();
+      if (!F.isDeclaration() && !F.doesNotRecurse() && F.hasInternalLinkage())
+        Worklist.push_back(&F);
+    }
+  }
+
+  bool Changed = false;
+  for (auto *F : llvm::reverse(Worklist)) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
+
+    if (Analyzer.runOnFunction(*F))
+      Changed = true;
+  }
+
+  return Changed;
 }
 
-bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
-  auto FI = FIM.find(F);
-  if (FI == FIM.end())
+char AMDGPUPerfHintAnalysisLegacy::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
+                "Analysis if a function is memory bound", true, true)
+
+bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
     return false;
 
-  return AMDGPUPerfHint::needLimitWave(FI->second);
+  const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
+  return Impl.runOnSCC(TM, SCC);
+}
+
+PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
+                                                  ModuleAnalysisManager &AM) {
+  auto &CG = AM.getResult<LazyCallGraphAnalysis>(M);
+
+  bool Changed = Impl->run(TM, CG);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<LazyCallGraphAnalysis>();
+  return PA;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 2db8db6957cee3..cf2ab825378009 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -12,35 +12,29 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
-#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
 
-#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/ValueMap.h"
 
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+
 namespace llvm {
 
-struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
-  static char ID;
+class AMDGPUPerfHintAnalysis;
+class CallGraphSCC;
+class GCNTargetMachine;
+class LazyCallGraph;
 
+class AMDGPUPerfHintAnalysis {
 public:
-  AMDGPUPerfHintAnalysis() : CallGraphSCCPass(ID) {}
-
-  bool runOnSCC(CallGraphSCC &SCC) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-
-  bool isMemoryBound(const Function *F) const;
-
-  bool needsWaveLimiter(const Function *F) const;
-
   struct FuncInfo {
     unsigned MemInstCost;
     unsigned InstCost;
-    unsigned IAMInstCost; // Indirect access memory instruction count
-    unsigned LSMInstCost; // Large stride memory instruction count
+    unsigned IAMInstCost;      // Indirect access memory instruction count
+    unsigned LSMInstCost;      // Large stride memory instruction count
     bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
                                // high global memory access
     FuncInfo()
@@ -48,11 +42,35 @@ struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
           HasDenseGlobalMemAcc(false) {}
   };
 
-  typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
+  typedef ValueMap<const Function *, FuncInfo> FuncInfoMap;
 
 private:
-
   FuncInfoMap FIM;
+
+public:
+  AMDGPUPerfHintAnalysis() {}
+
+  // OldPM
+  bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC);
+
+  // NewPM
+  bool run(const GCNTargetMachine &TM, LazyCallGraph &CG);
+
+  bool isMemoryBound(const Function *F) const;
+
+  bool needsWaveLimiter(const Function *F) const;
 };
+
+struct AMDGPUPerfHintAnalysisPass
+    : public PassInfoMixin<AMDGPUPerfHintAnalysisPass> {
+  const GCNTargetMachine &TM;
+  std::unique_ptr<AMDGPUPerfHintAnalysis> Impl;
+
+  AMDGPUPerfHintAnalysisPass(const GCNTargetMachine &TM)
+      : TM(TM), Impl(std::make_unique<AMDGPUPerfHintAnalysis>()) {}
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 } // namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e80daff96c431c..676b0b5046b243 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPUMacroFusion.h"
+#include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPURegBankSelect.h"
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetObjectFile.h"
@@ -1230,7 +1231,7 @@ bool GCNPassConfig::addPreISel() {
   addPass(createLCSSAPass());
 
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
-    addPass(&AMDGPUPerfHintAnalysisID);
+    addPass(&AMDGPUPerfHintAnalysisLegacyID);
 
   return false;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 77e0f46a3d4578..f4ee4fb82e7a33 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_membound: