[llvm] [KernelInfo] Implement new LLVM IR pass for GPU code analysis (PR #102944)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 12 10:58:09 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-analysis
Author: Joel E. Denny (jdenny-ornl)
<details>
<summary>Changes</summary>
This patch implements an LLVM IR pass, named kernel-info, that reports various statistics for codes compiled for GPUs. The ultimate goal of these statistics is to help identify bad code patterns and ways to mitigate them. The pass operates at the LLVM IR level so that it can, in theory, support any LLVM-based compiler for programming languages supporting GPUs. It has been tested so far with LLVM IR generated by Clang for OpenMP offload codes targeting NVIDIA GPUs and AMD GPUs.
By default, the pass is disabled. For convenience, `-kernel-info-end-lto` inserts it at the end of LTO, and options like `-Rpass=kernel-info` enable its remarks. Example opt and clang command lines appear in comments in
`llvm/include/llvm/Analysis/KernelInfo.h`. Remarks include summary statistics (e.g., total size of static allocas) and individual occurrences (e.g., source location of each alloca). Examples of its output appear in tests in `llvm/test/Analysis/KernelInfo`.
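For instance, the first two invocations from that header (the bitcode file name is simply the example used there):

```sh
# Analyze a C program as it appears to the LLVM GPU backend at the end of LTO.
clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
  -Rpass=kernel-info -mllvm -kernel-info-end-lto

# Run the pass directly on previously generated LLVM IR
# (e.g., saved via 'clang -save-temps -g -fopenmp --offload-arch=native test.c').
opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
  -pass-remarks=kernel-info -passes=kernel-info
```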
---
Patch is 129.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102944.diff
20 Files Affected:
- (added) llvm/include/llvm/Analysis/KernelInfo.h (+148)
- (modified) llvm/include/llvm/Target/TargetMachine.h (+3)
- (modified) llvm/lib/Analysis/CMakeLists.txt (+1)
- (added) llvm/lib/Analysis/KernelInfo.cpp (+350)
- (modified) llvm/lib/Passes/PassBuilder.cpp (+1)
- (modified) llvm/lib/Passes/PassRegistry.def (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+10)
- (modified) llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+10)
- (modified) llvm/lib/Target/TargetMachine.cpp (+5)
- (added) llvm/test/Analysis/KernelInfo/addrspace0.ll (+152)
- (added) llvm/test/Analysis/KernelInfo/allocas.ll (+78)
- (added) llvm/test/Analysis/KernelInfo/calls.ll (+112)
- (added) llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll (+47)
- (added) llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll (+47)
- (added) llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll (+40)
- (added) llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll (+36)
- (added) llvm/test/Analysis/KernelInfo/linkage.ll (+51)
- (added) llvm/test/Analysis/KernelInfo/openmp/README.md (+40)
- (added) llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll (+217)
- (added) llvm/test/Analysis/KernelInfo/openmp/nvptx.ll (+811)
``````````diff
diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
new file mode 100644
index 00000000000000..5495bb2fd4d925
--- /dev/null
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -0,0 +1,148 @@
+//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
+// classes used to extract function properties from a GPU kernel.
+//
+// To analyze a C program as it appears to an LLVM GPU backend at the end of
+// LTO:
+//
+// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+// -Rpass=kernel-info -mllvm -kernel-info-end-lto
+//
+// To analyze specified LLVM IR, perhaps previously generated by something like
+// 'clang -save-temps -g -fopenmp --offload-arch=native test.c':
+//
+// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+// -pass-remarks=kernel-info -passes=kernel-info
+//
+// kernel-info can also be inserted into a specified LLVM pass pipeline using
+// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline:
+//
+// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+// -Rpass=kernel-info -mllvm -kernel-info-end-lto \
+// -Xoffload-linker --lto-newpm-passes='lto<O2>'
+//
+// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+// -Rpass=kernel-info \
+// -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)'
+//
+// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+// -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>'
+//
+// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+// -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)'
+// ===---------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_KERNELINFO_H
+#define LLVM_ANALYSIS_KERNELINFO_H
+
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+
+namespace llvm {
+class DominatorTree;
+class Function;
+
+/// Data structure holding function info for kernels.
+class KernelInfo {
+ void updateForBB(const BasicBlock &BB, int64_t Direction,
+ OptimizationRemarkEmitter &ORE);
+
+public:
+ static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+
+ bool operator==(const KernelInfo &FPI) const {
+ return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0;
+ }
+
+ bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); }
+
+ /// If false, nothing was recorded here because the supplied function didn't
+ /// appear in a module compiled for a GPU.
+ bool IsValid = false;
+
+ /// Whether the function has external linkage and is not a kernel function.
+ bool ExternalNotKernel = false;
+
+ /// OpenMP Launch bounds.
+ ///@{
+ std::optional<int64_t> OmpTargetNumTeams;
+ std::optional<int64_t> OmpTargetThreadLimit;
+ ///@}
+
+ /// AMDGPU launch bounds.
+ ///@{
+ std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
+ std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
+ std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
+ std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
+ std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
+ std::optional<int64_t> AmdgpuWavesPerEuMin;
+ std::optional<int64_t> AmdgpuWavesPerEuMax;
+ ///@}
+
+ /// NVPTX launch bounds.
+ ///@{
+ std::optional<int64_t> Maxclusterrank;
+ std::optional<int64_t> Maxntidx;
+ ///@}
+
+ /// The number of alloca instructions inside the function, the number of those
+ /// with allocation sizes that cannot be determined at compile time, and the
+ /// sum of the sizes that can be.
+ ///
+ /// With the current implementation for at least some GPU archs,
+ /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
+ /// case the implementation changes.
+ int64_t Allocas = 0;
+ int64_t AllocasDyn = 0;
+ int64_t AllocasStaticSizeSum = 0;
+
+ /// Number of direct/indirect calls (anything derived from CallBase).
+ int64_t DirectCalls = 0;
+ int64_t IndirectCalls = 0;
+
+ /// Number of direct calls made from this function to other functions
+ /// defined in this module.
+ int64_t DirectCallsToDefinedFunctions = 0;
+
+ /// Number of calls of type InvokeInst.
+ int64_t Invokes = 0;
+
+ /// Number of addrspace(0) memory accesses (via load, store, etc.).
+ int64_t AddrspaceZeroAccesses = 0;
+};
+
+/// Analysis class for KernelInfo.
+class KernelInfoAnalysis : public AnalysisInfoMixin<KernelInfoAnalysis> {
+public:
+ static AnalysisKey Key;
+
+ using Result = const KernelInfo;
+
+ KernelInfo run(Function &F, FunctionAnalysisManager &FAM) {
+ return KernelInfo::getKernelInfo(F, FAM);
+ }
+};
+
+/// Printer pass for KernelInfoAnalysis.
+///
+/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled.
+class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+public:
+ explicit KernelInfoPrinter() {}
+
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
+ AM.getResult<KernelInfoAnalysis>(F);
+ return PreservedAnalyses::all();
+ }
+
+ static bool isRequired() { return true; }
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_KERNELINFO_H
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index c3e9d41315f617..5c338a8fcd0cfb 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -18,6 +18,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/PGOOptions.h"
#include "llvm/Target/CGPassBuilderOption.h"
@@ -27,6 +28,8 @@
#include <string>
#include <utility>
+extern llvm::cl::opt<bool> KernelInfoEndLTO;
+
namespace llvm {
class AAManager;
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 2cb3547ec40473..02e76af8d903de 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis
InstructionPrecedenceTracking.cpp
InstructionSimplify.cpp
InteractiveModelRunner.cpp
+ KernelInfo.cpp
LazyBranchProbabilityInfo.cpp
LazyBlockFrequencyInfo.cpp
LazyCallGraph.cpp
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
new file mode 100644
index 00000000000000..9df3b5b32afcb4
--- /dev/null
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -0,0 +1,350 @@
+//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
+// classes used to extract function properties from a kernel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/KernelInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "kernel-info"
+
+static bool isKernelFunction(Function &F) {
+ // TODO: Is this general enough? Consider languages beyond OpenMP.
+ return F.hasFnAttribute("kernel");
+}
+
+static void identifyFunction(OptimizationRemark &R, const Function &F) {
+ if (auto *SubProgram = F.getSubprogram()) {
+ if (SubProgram->isArtificial())
+ R << "artificial ";
+ }
+ R << "function '" << F.getName() << "'";
+}
+
+static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller,
+ const AllocaInst &Alloca,
+ TypeSize::ScalarTy StaticSize) {
+ ORE.emit([&] {
+ StringRef Name;
+ DebugLoc Loc;
+ bool Artificial = false;
+ auto DVRs = findDVRDeclares(&const_cast<AllocaInst &>(Alloca));
+ if (!DVRs.empty()) {
+ const DbgVariableRecord &DVR = **DVRs.begin();
+ Name = DVR.getVariable()->getName();
+ Loc = DVR.getDebugLoc();
+ Artificial = DVR.Variable->isArtificial();
+ }
+ OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc),
+ Alloca.getParent());
+ R << "in ";
+ identifyFunction(R, Caller);
+ R << ", ";
+ if (Artificial)
+ R << "artificial ";
+ if (Name.empty()) {
+ R << "unnamed alloca ";
+ if (DVRs.empty())
+ R << "(missing debug metadata) ";
+ } else {
+ R << "alloca '" << Name << "' ";
+ }
+ R << "with ";
+ if (StaticSize)
+ R << "static size of " << itostr(StaticSize) << " bytes";
+ else
+ R << "dynamic size";
+ return R;
+ });
+}
+
+static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
+ const CallBase &Call, StringRef CallKind,
+ StringRef RemarkKind) {
+ ORE.emit([&] {
+ OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call);
+ R << "in ";
+ identifyFunction(R, Caller);
+ R << ", " << CallKind;
+ if (const Function *Callee =
+ dyn_cast_or_null<Function>(Call.getCalledOperand())) {
+ R << ", callee is";
+ StringRef Name = Callee->getName();
+ if (auto *SubProgram = Callee->getSubprogram()) {
+ if (SubProgram->isArtificial())
+ R << " artificial";
+ }
+ if (!Name.empty())
+ R << " '" << Name << "'";
+ else
+ R << " with unknown name";
+ }
+ return R;
+ });
+}
+
+static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE,
+ const Function &Caller,
+ const Instruction &Inst) {
+ ORE.emit([&] {
+ OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst);
+ R << "in ";
+ identifyFunction(R, Caller);
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) {
+ R << ", '" << II->getCalledFunction()->getName() << "' call";
+ } else {
+ R << ", '" << Inst.getOpcodeName() << "' instruction";
+ }
+ if (Inst.hasName())
+ R << " ('%" << Inst.getName() << "')";
+ R << " accesses memory in addrspace(0)";
+ return R;
+ });
+}
+
+void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
+ OptimizationRemarkEmitter &ORE) {
+ assert(Direction == 1 || Direction == -1);
+ const Function &F = *BB.getParent();
+ const Module &M = *F.getParent();
+ const DataLayout &DL = M.getDataLayout();
+ for (const Instruction &I : BB.instructionsWithoutDebug()) {
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+ Allocas += Direction;
+ TypeSize::ScalarTy StaticSize = 0;
+ if (std::optional<TypeSize> Size = Alloca->getAllocationSize(DL)) {
+ StaticSize = Size->getFixedValue();
+ assert(StaticSize <= std::numeric_limits<int64_t>::max());
+ AllocasStaticSizeSum += Direction * StaticSize;
+ } else {
+ AllocasDyn += Direction;
+ }
+ remarkAlloca(ORE, F, *Alloca, StaticSize);
+ } else if (const CallBase *Call = dyn_cast<CallBase>(&I)) {
+ std::string CallKind;
+ std::string RemarkKind;
+ if (Call->isIndirectCall()) {
+ IndirectCalls += Direction;
+ CallKind += "indirect";
+ RemarkKind += "Indirect";
+ } else {
+ DirectCalls += Direction;
+ CallKind += "direct";
+ RemarkKind += "Direct";
+ }
+ if (isa<InvokeInst>(Call)) {
+ Invokes += Direction;
+ CallKind += " invoke";
+ RemarkKind += "Invoke";
+ } else {
+ CallKind += " call";
+ RemarkKind += "Call";
+ }
+ if (!Call->isIndirectCall()) {
+ if (const Function *Callee = Call->getCalledFunction()) {
+ if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) {
+ DirectCallsToDefinedFunctions += Direction;
+ CallKind += " to defined function";
+ RemarkKind += "ToDefinedFunction";
+ }
+ }
+ }
+ remarkCall(ORE, F, *Call, CallKind, RemarkKind);
+ if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
+ if (MI->getDestAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ } else if (const AnyMemTransferInst *MT =
+ dyn_cast<AnyMemTransferInst>(MI)) {
+ if (MT->getSourceAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ }
+ }
+ }
+ } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+ if (Load->getPointerAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ }
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+ if (Store->getPointerAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ }
+ } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
+ if (At->getPointerAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ }
+ } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
+ if (At->getPointerAddressSpace() == 0) {
+ AddrspaceZeroAccesses += Direction;
+ remarkAddrspaceZeroAccess(ORE, F, I);
+ }
+ }
+ }
+}
+
+static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+ StringRef Name, int64_t Value) {
+ ORE.emit([&] {
+ OptimizationRemark R(DEBUG_TYPE, Name, &F);
+ R << "in ";
+ identifyFunction(R, F);
+ R << ", " << Name << " = " << itostr(Value);
+ return R;
+ });
+}
+
+static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+ StringRef Name, std::optional<int64_t> Value) {
+ if (!Value)
+ return;
+ remarkProperty(ORE, F, Name, Value.value());
+}
+
+static std::vector<std::optional<int64_t>>
+parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) {
+ std::vector<std::optional<int64_t>> Result(NumFields);
+ Attribute A = F.getFnAttribute(Name);
+ if (!A.isStringAttribute())
+ return Result;
+ StringRef Rest = A.getValueAsString();
+ for (unsigned I = 0; I < NumFields; ++I) {
+ StringRef Field;
+ std::tie(Field, Rest) = Rest.split(',');
+ if (Field.empty())
+ break;
+ int64_t Val;
+ if (Field.getAsInteger(0, Val)) {
+ F.getContext().emitError("cannot parse integer in attribute '" + Name +
+ "': " + Field);
+ break;
+ }
+ Result[I] = Val;
+ }
+ if (!Rest.empty())
+ F.getContext().emitError("too many fields in attribute " + Name);
+ return Result;
+}
+
+static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
+ StringRef Name) {
+ return parseFnAttrAsIntegerFields(F, Name, 1)[0];
+}
+
+// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp. Can we
+// share?
+static MDNode *getNVPTXMDNode(Function &F, StringRef Name) {
+ Module &M = *F.getParent();
+ NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
+ if (!MD)
+ return nullptr;
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() != 3)
+ continue;
+ auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
+ if (!KernelOp || KernelOp->getValue() != &F)
+ continue;
+ auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
+ if (!Prop || Prop->getString() != Name)
+ continue;
+ return Op;
+ }
+ return nullptr;
+}
+
+static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
+ StringRef Name) {
+ std::optional<int64_t> Result;
+ if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) {
+ auto *Op = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
+ Result = cast<ConstantInt>(Op->getValue())->getZExtValue();
+ }
+ return Result;
+}
+
+KernelInfo KernelInfo::getKernelInfo(Function &F,
+ FunctionAnalysisManager &FAM) {
+ KernelInfo KI;
+ // Only analyze modules for GPUs.
+ // TODO: This would be more maintainable if there were an isGPU.
+ const std::string &TT = F.getParent()->getTargetTriple();
+ llvm::Triple T(TT);
+ if (!T.isAMDGPU() && !T.isNVPTX())
+ return KI;
+ KI.IsValid = true;
+
+ // Record function properties.
+ KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
+ KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams");
+ KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit");
+ auto AmdgpuMaxNumWorkgroups =
+ parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3);
+ KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0];
+ KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1];
+ KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2];
+ auto AmdgpuFlatWorkGroupSize =
+ parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2);
+ KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0];
+ KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1];
+ auto AmdgpuWavesPerEu =
+ parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2);
+ KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0];
+ KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1];
+ KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank");
+ KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx");
+
+ const DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ for (const auto &BB : F)
+ if (DT.isReachableFromEntry(&BB))
+ KI.updateForBB(BB, +1, ORE);
+
+#define REMARK_PROPERTY(PROP_NAME) \
+ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
+ REMARK_PROPERTY(ExternalNotKernel);
+ REMARK_PROPERTY(OmpTargetNumTeams);
+ REMARK_PROPERTY(OmpTargetThreadLimit);
+ REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX);
+ REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY);
+ REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ);
+ REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin);
+ REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax);
+ REMARK_PROPERTY(AmdgpuWavesPerEuMin);
+ REMARK_PROPERTY(AmdgpuWavesPerEuMax);
+ REMARK_PROPERTY(Maxclusterrank);
+ REMARK_PROPERTY(Maxntidx);
+ REMARK_PROPERTY(Allocas);
+ REMARK_PROPERTY(AllocasStaticSizeSum);
+ REMARK_PROPERTY(AllocasDyn);
+ REMARK_PROPERTY(DirectCalls);
+ REMARK_PROPERTY(IndirectCalls);
+ REMARK_PROPERTY(DirectCallsToDefinedFunctions);
+ REMARK_PROPERTY(Invokes);
+ REMARK_PROPERTY(AddrspaceZeroAccesses);
+#undef REMARK_PROPERTY
+
+ return KI;
+}
+
+AnalysisKey KernelInfoAnalysis::Key;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 46f43f3de4705c..61677f02783cc9 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -44,6 +44,7 @@
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
#include "llvm/Analysis/InstCount.h"
+#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Lint.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 0cec9fbd7cd05e..dcfa732f410b38 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -278,6 +278,7 @@ FUNCTION_ANALYSIS(
MachineFunctionAnalysis(static_ca...
[truncated]
``````````
</details>
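For tools that want these statistics programmatically rather than as remarks, the following is a minimal, untested sketch of querying the analysis through the new pass manager. Only `KernelInfoAnalysis`, its `KernelInfo` result, and the standard analysis-manager setup come from the patch and existing LLVM APIs; the helper `collectKernelInfo` is hypothetical:

```cpp
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Hypothetical helper: query KernelInfoAnalysis for every defined function.
static void collectKernelInfo(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  // Registers the analyses listed in PassRegistry.def, which now includes
  // KernelInfoAnalysis.
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;
    // Remarks are emitted as a side effect if remark output is enabled.
    const KernelInfo &KI = FAM.getResult<KernelInfoAnalysis>(F);
    if (!KI.IsValid)
      continue; // The module does not target AMDGPU or NVPTX.
    // Summary counters are plain fields, e.g. KI.Allocas,
    // KI.AllocasStaticSizeSum, KI.DirectCalls, KI.AddrspaceZeroAccesses.
  }
}
```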
https://github.com/llvm/llvm-project/pull/102944
More information about the llvm-commits mailing list