[llvm] [KernelInfo] Implement new LLVM IR pass for GPU code analysis (PR #102944)

Sat Sep 28 10:16:59 PDT 2024

https://github.com/jdenny-ornl updated https://github.com/llvm/llvm-project/pull/102944

>From 5a671f685921b5cc02ced87a410645e8ad1b5c98 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 13:55:13 -0400
Subject: [PATCH 01/27] [KernelInfo] Implement new LLVM IR pass for GPU code
 analysis

This patch implements an LLVM IR pass, named kernel-info, that reports
various statistics for codes compiled for GPUs.  The ultimate goal of
these statistics is to help identify bad code patterns and ways to
mitigate them.  The pass operates at the LLVM IR level so that it can,
in theory, support any LLVM-based compiler for programming languages
supporting GPUs.  It has been tested so far with LLVM IR generated by
Clang for OpenMP offload codes targeting NVIDIA GPUs and AMD GPUs.

By default, the pass is disabled.  For convenience,
`-kernel-info-end-lto` inserts it at the end of LTO, and options like
`-Rpass=kernel-info` enable its remarks.  Example opt and clang
command lines appear in comments in
`llvm/include/llvm/Analysis/KernelInfo.h`.  Remarks include summary
statistics (e.g., total size of static allocas) and individual
occurrences (e.g., source location of each alloca).  Examples of its
output appear in tests in `llvm/test/Analysis/KernelInfo`.
---
 llvm/include/llvm/Analysis/KernelInfo.h       | 148 ++++
 llvm/include/llvm/Target/TargetMachine.h      |   3 +
 llvm/lib/Analysis/CMakeLists.txt              |   1 +
 llvm/lib/Analysis/KernelInfo.cpp              | 350 ++++++++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   2 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  10 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |  10 +
 llvm/lib/Target/TargetMachine.cpp             |   5 +
 llvm/test/Analysis/KernelInfo/addrspace0.ll   | 152 ++++
 llvm/test/Analysis/KernelInfo/allocas.ll      |  78 ++
 llvm/test/Analysis/KernelInfo/calls.ll        | 112 +++
 .../kernel-info-after-lto/amdgpu.ll           |  47 +
 .../KernelInfo/kernel-info-after-lto/nvptx.ll |  47 +
 .../KernelInfo/launch-bounds/amdgpu.ll        |  40 +
 .../KernelInfo/launch-bounds/nvptx.ll         |  36 +
 llvm/test/Analysis/KernelInfo/linkage.ll      |  51 ++
 .../test/Analysis/KernelInfo/openmp/README.md |  40 +
 .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 217 +++++
 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 811 ++++++++++++++++++
 20 files changed, 2161 insertions(+)
 create mode 100644 llvm/include/llvm/Analysis/KernelInfo.h
 create mode 100644 llvm/lib/Analysis/KernelInfo.cpp
 create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/allocas.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/calls.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/linkage.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/openmp/README.md
 create mode 100644 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
new file mode 100644
index 00000000000000..5495bb2fd4d925
--- /dev/null
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -0,0 +1,148 @@
+//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
+// classes used to extract function properties from a GPU kernel.
+//
+// To analyze a C program as it appears to an LLVM GPU backend at the end of
+// LTO:
+//
+//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+//       -Rpass=kernel-info -mllvm -kernel-info-end-lto
+//
+// To analyze specified LLVM IR, perhaps previously generated by something like
+// 'clang -save-temps -g -fopenmp --offload-arch=native test.c':
+//
+//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+//       -pass-remarks=kernel-info -passes=kernel-info
+//
+// kernel-info can also be inserted into a specified LLVM pass pipeline using
+// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline:
+//
+//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+//       -Rpass=kernel-info -mllvm -kernel-info-end-lto \
+//       -Xoffload-linker --lto-newpm-passes='lto<O2>'
+//
+//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+//       -Rpass=kernel-info \
+//       -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)'
+//
+//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+//       -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>'
+//
+//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+//       -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)'
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_KERNELINFO_H
+#define LLVM_ANALYSIS_KERNELINFO_H
+
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+
+namespace llvm {
+class DominatorTree;
+class Function;
+
+/// Data structure holding function info for kernels.
+class KernelInfo {
+  void updateForBB(const BasicBlock &BB, int64_t Direction,
+                   OptimizationRemarkEmitter &ORE);
+
+public:
+  static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+
+  bool operator==(const KernelInfo &FPI) const {
+    return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0;
+  }
+
+  bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); }
+
+  /// If false, nothing was recorded here because the supplied function didn't
+  /// appear in a module compiled for a GPU.
+  bool IsValid = false;
+
+  /// Whether the function has external linkage and is not a kernel function.
+  bool ExternalNotKernel = false;
+
+  /// OpenMP Launch bounds.
+  ///@{
+  std::optional<int64_t> OmpTargetNumTeams;
+  std::optional<int64_t> OmpTargetThreadLimit;
+  ///@}
+
+  /// AMDGPU launch bounds.
+  ///@{
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
+  std::optional<int64_t> AmdgpuWavesPerEuMin;
+  std::optional<int64_t> AmdgpuWavesPerEuMax;
+  ///@}
+
+  /// NVPTX launch bounds.
+  ///@{
+  std::optional<int64_t> Maxclusterrank;
+  std::optional<int64_t> Maxntidx;
+  ///@}
+
+  /// The number of alloca instructions inside the function, the number of those
+  /// with allocation sizes that cannot be determined at compile time, and the
+  /// sum of the sizes that can be.
+  ///
+  /// With the current implementation for at least some GPU archs,
+  /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
+  /// case the implementation changes.
+  int64_t Allocas = 0;
+  int64_t AllocasDyn = 0;
+  int64_t AllocasStaticSizeSum = 0;
+
+  /// Number of direct/indirect calls (anything derived from CallBase).
+  int64_t DirectCalls = 0;
+  int64_t IndirectCalls = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of calls of type InvokeInst.
+  int64_t Invokes = 0;
+
+  /// Number of addrspace(0) memory accesses (via load, store, etc.).
+  int64_t AddrspaceZeroAccesses = 0;
+};
+
+/// Analysis class for KernelInfo.
+class KernelInfoAnalysis : public AnalysisInfoMixin<KernelInfoAnalysis> {
+public:
+  static AnalysisKey Key;
+
+  using Result = const KernelInfo;
+
+  KernelInfo run(Function &F, FunctionAnalysisManager &FAM) {
+    return KernelInfo::getKernelInfo(F, FAM);
+  }
+};
+
+/// Printer pass for KernelInfoAnalysis.
+///
+/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled.
+class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+public:
+  explicit KernelInfoPrinter() {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
+    AM.getResult<KernelInfoAnalysis>(F);
+    return PreservedAnalyses::all();
+  }
+
+  static bool isRequired() { return true; }
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_KERNELINFO_H
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index c3e9d41315f617..5c338a8fcd0cfb 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -18,6 +18,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Target/CGPassBuilderOption.h"
@@ -27,6 +28,8 @@
 #include <string>
 #include <utility>
 
+extern llvm::cl::opt<bool> KernelInfoEndLTO;
+
 namespace llvm {
 
 class AAManager;
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 2cb3547ec40473..02e76af8d903de 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis
   InstructionPrecedenceTracking.cpp
   InstructionSimplify.cpp
   InteractiveModelRunner.cpp
+  KernelInfo.cpp
   LazyBranchProbabilityInfo.cpp
   LazyBlockFrequencyInfo.cpp
   LazyCallGraph.cpp
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
new file mode 100644
index 00000000000000..9df3b5b32afcb4
--- /dev/null
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -0,0 +1,350 @@
+//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
+// classes used to extract function properties from a kernel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/KernelInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "kernel-info"
+
+static bool isKernelFunction(Function &F) {
+  // TODO: Is this general enough?  Consider languages beyond OpenMP.
+  return F.hasFnAttribute("kernel");
+}
+
+static void identifyFunction(OptimizationRemark &R, const Function &F) {
+  if (auto *SubProgram = F.getSubprogram()) {
+    if (SubProgram->isArtificial())
+      R << "artificial ";
+  }
+  R << "function '" << F.getName() << "'";
+}
+
+static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller,
+                         const AllocaInst &Alloca,
+                         TypeSize::ScalarTy StaticSize) {
+  ORE.emit([&] {
+    StringRef Name;
+    DebugLoc Loc;
+    bool Artificial = false;
+    auto DVRs = findDVRDeclares(&const_cast<AllocaInst &>(Alloca));
+    if (!DVRs.empty()) {
+      const DbgVariableRecord &DVR = **DVRs.begin();
+      Name = DVR.getVariable()->getName();
+      Loc = DVR.getDebugLoc();
+      Artificial = DVR.Variable->isArtificial();
+    }
+    OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc),
+                         Alloca.getParent());
+    R << "in ";
+    identifyFunction(R, Caller);
+    R << ", ";
+    if (Artificial)
+      R << "artificial ";
+    if (Name.empty()) {
+      R << "unnamed alloca ";
+      if (DVRs.empty())
+        R << "(missing debug metadata) ";
+    } else {
+      R << "alloca '" << Name << "' ";
+    }
+    R << "with ";
+    if (StaticSize)
+      R << "static size of " << itostr(StaticSize) << " bytes";
+    else
+      R << "dynamic size";
+    return R;
+  });
+}
+
+static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
+                       const CallBase &Call, StringRef CallKind,
+                       StringRef RemarkKind) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call);
+    R << "in ";
+    identifyFunction(R, Caller);
+    R << ", " << CallKind;
+    if (const Function *Callee =
+            dyn_cast_or_null<Function>(Call.getCalledOperand())) {
+      R << ", callee is";
+      StringRef Name = Callee->getName();
+      if (auto *SubProgram = Callee->getSubprogram()) {
+        if (SubProgram->isArtificial())
+          R << " artificial";
+      }
+      if (!Name.empty())
+        R << " '" << Name << "'";
+      else
+        R << " with unknown name";
+    }
+    return R;
+  });
+}
+
+static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE,
+                                      const Function &Caller,
+                                      const Instruction &Inst) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst);
+    R << "in ";
+    identifyFunction(R, Caller);
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) {
+      R << ", '" << II->getCalledFunction()->getName() << "' call";
+    } else {
+      R << ", '" << Inst.getOpcodeName() << "' instruction";
+    }
+    if (Inst.hasName())
+      R << " ('%" << Inst.getName() << "')";
+    R << " accesses memory in addrspace(0)";
+    return R;
+  });
+}
+
+void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
+                             OptimizationRemarkEmitter &ORE) {
+  assert(Direction == 1 || Direction == -1);
+  const Function &F = *BB.getParent();
+  const Module &M = *F.getParent();
+  const DataLayout &DL = M.getDataLayout();
+  for (const Instruction &I : BB.instructionsWithoutDebug()) {
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+      Allocas += Direction;
+      TypeSize::ScalarTy StaticSize = 0;
+      if (std::optional<TypeSize> Size = Alloca->getAllocationSize(DL)) {
+        StaticSize = Size->getFixedValue();
+        assert(StaticSize <= std::numeric_limits<int64_t>::max());
+        AllocasStaticSizeSum += Direction * StaticSize;
+      } else {
+        AllocasDyn += Direction;
+      }
+      remarkAlloca(ORE, F, *Alloca, StaticSize);
+    } else if (const CallBase *Call = dyn_cast<CallBase>(&I)) {
+      std::string CallKind;
+      std::string RemarkKind;
+      if (Call->isIndirectCall()) {
+        IndirectCalls += Direction;
+        CallKind += "indirect";
+        RemarkKind += "Indirect";
+      } else {
+        DirectCalls += Direction;
+        CallKind += "direct";
+        RemarkKind += "Direct";
+      }
+      if (isa<InvokeInst>(Call)) {
+        Invokes += Direction;
+        CallKind += " invoke";
+        RemarkKind += "Invoke";
+      } else {
+        CallKind += " call";
+        RemarkKind += "Call";
+      }
+      if (!Call->isIndirectCall()) {
+        if (const Function *Callee = Call->getCalledFunction()) {
+          if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) {
+            DirectCallsToDefinedFunctions += Direction;
+            CallKind += " to defined function";
+            RemarkKind += "ToDefinedFunction";
+          }
+        }
+      }
+      remarkCall(ORE, F, *Call, CallKind, RemarkKind);
+      if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
+        if (MI->getDestAddressSpace() == 0) {
+          AddrspaceZeroAccesses += Direction;
+          remarkAddrspaceZeroAccess(ORE, F, I);
+        } else if (const AnyMemTransferInst *MT =
+                       dyn_cast<AnyMemTransferInst>(MI)) {
+          if (MT->getSourceAddressSpace() == 0) {
+            AddrspaceZeroAccesses += Direction;
+            remarkAddrspaceZeroAccess(ORE, F, I);
+          }
+        }
+      }
+    } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+      if (Load->getPointerAddressSpace() == 0) {
+        AddrspaceZeroAccesses += Direction;
+        remarkAddrspaceZeroAccess(ORE, F, I);
+      }
+    } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+      if (Store->getPointerAddressSpace() == 0) {
+        AddrspaceZeroAccesses += Direction;
+        remarkAddrspaceZeroAccess(ORE, F, I);
+      }
+    } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
+      if (At->getPointerAddressSpace() == 0) {
+        AddrspaceZeroAccesses += Direction;
+        remarkAddrspaceZeroAccess(ORE, F, I);
+      }
+    } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
+      if (At->getPointerAddressSpace() == 0) {
+        AddrspaceZeroAccesses += Direction;
+        remarkAddrspaceZeroAccess(ORE, F, I);
+      }
+    }
+  }
+}
+
+static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+                           StringRef Name, int64_t Value) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, Name, &F);
+    R << "in ";
+    identifyFunction(R, F);
+    R << ", " << Name << " = " << itostr(Value);
+    return R;
+  });
+}
+
+static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+                           StringRef Name, std::optional<int64_t> Value) {
+  if (!Value)
+    return;
+  remarkProperty(ORE, F, Name, Value.value());
+}
+
+static std::vector<std::optional<int64_t>>
+parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) {
+  std::vector<std::optional<int64_t>> Result(NumFields);
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Result;
+  StringRef Rest = A.getValueAsString();
+  for (unsigned I = 0; I < NumFields; ++I) {
+    StringRef Field;
+    std::tie(Field, Rest) = Rest.split(',');
+    if (Field.empty())
+      break;
+    int64_t Val;
+    if (Field.getAsInteger(0, Val)) {
+      F.getContext().emitError("cannot parse integer in attribute '" + Name +
+                               "': " + Field);
+      return Result; // Already diagnosed; skip the field-count check.
+    }
+    Result[I] = Val;
+  }
+  if (!Rest.empty())
+    F.getContext().emitError("too many fields in attribute " + Name);
+  return Result;
+}
+
+static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
+                                                   StringRef Name) {
+  return parseFnAttrAsIntegerFields(F, Name, 1)[0];
+}
+
+// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp.  Can we
+// share?
+static MDNode *getNVPTXMDNode(Function &F, StringRef Name) {
+  Module &M = *F.getParent();
+  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
+  if (!MD)
+    return nullptr;
+  for (auto *Op : MD->operands()) {
+    if (Op->getNumOperands() != 3)
+      continue;
+    auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
+    if (!KernelOp || KernelOp->getValue() != &F)
+      continue;
+    auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
+    if (!Prop || Prop->getString() != Name)
+      continue;
+    return Op;
+  }
+  return nullptr;
+}
+
+static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
+                                                        StringRef Name) {
+  std::optional<int64_t> Result;
+  if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) {
+    auto *Op = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
+    Result = cast<ConstantInt>(Op->getValue())->getZExtValue();
+  }
+  return Result;
+}
+
+KernelInfo KernelInfo::getKernelInfo(Function &F,
+                                     FunctionAnalysisManager &FAM) {
+  KernelInfo KI;
+  // Only analyze modules for GPUs.
+  // TODO: This would be more maintainable if there were an isGPU.
+  const std::string &TT = F.getParent()->getTargetTriple();
+  llvm::Triple T(TT);
+  if (!T.isAMDGPU() && !T.isNVPTX())
+    return KI;
+  KI.IsValid = true;
+
+  // Record function properties.
+  KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
+  KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams");
+  KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit");
+  auto AmdgpuMaxNumWorkgroups =
+      parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3);
+  KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0];
+  KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1];
+  KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2];
+  auto AmdgpuFlatWorkGroupSize =
+      parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2);
+  KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0];
+  KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1];
+  auto AmdgpuWavesPerEu =
+      parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2);
+  KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0];
+  KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1];
+  KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank");
+  KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx");
+
+  const DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  for (const auto &BB : F)
+    if (DT.isReachableFromEntry(&BB))
+      KI.updateForBB(BB, +1, ORE);
+
+#define REMARK_PROPERTY(PROP_NAME)                                             \
+  remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
+  REMARK_PROPERTY(ExternalNotKernel);
+  REMARK_PROPERTY(OmpTargetNumTeams);
+  REMARK_PROPERTY(OmpTargetThreadLimit);
+  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX);
+  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY);
+  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ);
+  REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin);
+  REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax);
+  REMARK_PROPERTY(AmdgpuWavesPerEuMin);
+  REMARK_PROPERTY(AmdgpuWavesPerEuMax);
+  REMARK_PROPERTY(Maxclusterrank);
+  REMARK_PROPERTY(Maxntidx);
+  REMARK_PROPERTY(Allocas);
+  REMARK_PROPERTY(AllocasStaticSizeSum);
+  REMARK_PROPERTY(AllocasDyn);
+  REMARK_PROPERTY(DirectCalls);
+  REMARK_PROPERTY(IndirectCalls);
+  REMARK_PROPERTY(DirectCallsToDefinedFunctions);
+  REMARK_PROPERTY(Invokes);
+  REMARK_PROPERTY(AddrspaceZeroAccesses);
+#undef REMARK_PROPERTY
+
+  return KI;
+}
+
+AnalysisKey KernelInfoAnalysis::Key;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 46f43f3de4705c..61677f02783cc9 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Analysis/InlineAdvisor.h"
 #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
 #include "llvm/Analysis/InstCount.h"
+#include "llvm/Analysis/KernelInfo.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/Lint.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 0cec9fbd7cd05e..dcfa732f410b38 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -278,6 +278,7 @@ FUNCTION_ANALYSIS(
     MachineFunctionAnalysis(static_cast<const LLVMTargetMachine *>(TM)))
 FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis())
 FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis())
+FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis())
 FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
 FUNCTION_ANALYSIS("loops", LoopAnalysis())
 FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
@@ -374,6 +375,7 @@ FUNCTION_PASS("irce", IRCEPass())
 FUNCTION_PASS("jump-threading", JumpThreadingPass())
 FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass());
 FUNCTION_PASS("kcfi", KCFIPass())
+FUNCTION_PASS("kernel-info", KernelInfoPrinter())
 FUNCTION_PASS("lcssa", LCSSAPass())
 FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
 FUNCTION_PASS("lint", LintPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0523fee5bcf9f4..3b2ed9fe4236c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -40,6 +40,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/KernelInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -772,6 +773,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
           return onlyAllocateVGPRs;
         return nullptr;
       });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [](ModulePassManager &PM, OptimizationLevel Level) {
+        if (KernelInfoEndLTO) {
+          FunctionPassManager FPM;
+          FPM.addPass(KernelInfoPrinter());
+          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        }
+      });
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 097e29527eed9f..8d77c8e53f7a6a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "NVPTXTargetTransformInfo.h"
 #include "TargetInfo/NVPTXTargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/KernelInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -238,6 +239,15 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         FPM.addPass(NVVMIntrRangePass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [](ModulePassManager &PM, OptimizationLevel Level) {
+        if (KernelInfoEndLTO) {
+          FunctionPassManager FPM;
+          FPM.addPass(KernelInfoPrinter());
+          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        }
+      });
 }
 
 TargetTransformInfo
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index c0985f3be91a53..b235fd8f6f49a4 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -26,6 +26,11 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 using namespace llvm;
 
+cl::opt<bool> KernelInfoEndLTO(
+    "kernel-info-end-lto",
+    cl::desc("add the kernel-info pass at the end of the full LTO pipeline"),
+    cl::init(false), cl::Hidden);
+
 //---------------------------------------------------------------------------
 // TargetMachine Class
 //
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0.ll
new file mode 100644
index 00000000000000..4c472396443f52
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/addrspace0.ll
@@ -0,0 +1,152 @@
+; Check info on addrspace(0) memory accesses.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define void @f() !dbg !3 {
+entry:
+  ; load: reported for default and explicit addrspace(0) pointers only; a named result is shown in the remark.
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0)
+  %0 = load i32, ptr null, align 4, !dbg !6
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0)
+  %load = load i32, ptr null, align 4, !dbg !6
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0)
+  %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6
+  %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6
+  %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6
+
+  ; store: reported for default and explicit addrspace(0) pointers, not for addrspace(1) or addrspace(8).
+  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0)
+  store i32 0, ptr null, align 4, !dbg !7
+  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0)
+  store i32 0, ptr addrspace(0) null, align 4, !dbg !7
+  store i32 0, ptr addrspace(1) null, align 4, !dbg !7
+  store i32 0, ptr addrspace(8) null, align 4, !dbg !7
+
+  ; atomicrmw: reported for default and explicit addrspace(0) pointers, not for addrspace(1) or addrspace(37).
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0)
+  atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0)
+  atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8
+  atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8
+  atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8
+
+  ; cmpxchg: reported for default and explicit addrspace(0) pointers, not for addrspace(1) or addrspace(934).
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0)
+  cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0)
+  cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
+  cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
+  cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
+
+  ; llvm.memcpy: one remark per call when the destination, the source, or both are in addrspace(0).
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
+
+  ; llvm.memcpy.inline: reported when either pointer operand is in addrspace(0); the p1.p1 call is not.
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
+  call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
+
+  ; llvm.memcpy.element.unordered.atomic: reported when either pointer operand is in addrspace(0); the p1.p1 call is not.
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10
+  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10
+
+  ; llvm.memmove: one remark per call when the destination, the source, or both are in addrspace(0).
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
+  call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
+  call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11
+
+  ; llvm.memmove.element.unordered.atomic: reported when either pointer operand is in addrspace(0); the p1.p1 call is not.
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11
+  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11
+
+  ; llvm.memset: reported when the destination is in addrspace(0), not for addrspace(1) or addrspace(3).
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+
+  ; llvm.memset.inline: reported when the destination is in addrspace(0), not for addrspace(1) or addrspace(3).
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+  call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12
+
+  ; llvm.memset.element.unordered.atomic: reported when the destination is in addrspace(0), not for addrspace(1) or addrspace(3).
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0)
+  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12
+  call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12
+  call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12
+
+  ret void
+}
+; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !5)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 3, column: 11, scope: !3)
+!7 = !DILocation(line: 4, column: 6, scope: !3)
+!8 = !DILocation(line: 5, column: 1, scope: !3)
+!9 = !DILocation(line: 6, column: 2, scope: !3)
+!10 = !DILocation(line: 7, column: 3, scope: !3)
+!11 = !DILocation(line: 8, column: 4, scope: !3)
+!12 = !DILocation(line: 9, column: 5, scope: !3)
diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll
new file mode 100644
index 00000000000000..048d53799c33ef
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/allocas.ll
@@ -0,0 +1,78 @@
+; Check info on allocas.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define void @h() !dbg !3 {
+entry:
+  ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca 'dyn_ptr' with static size of 8 bytes
+  %dyn_ptr.addr = alloca ptr, align 8
+  ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca 'i' with static size of 4 bytes
+  %i = alloca i32, align 4
+  ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca 'a' with static size of 8 bytes
+  %a = alloca [2 x i32], align 4
+  tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11
+  tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20
+  ret void
+}
+; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 3
+; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 20
+; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0
+
+define void @g() !dbg !21 {
+entry:
+  ; CHECK: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes
+  %i = alloca i32, align 4
+  ; CHECK: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes
+  %a = alloca [2 x i32], align 4
+  tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26
+  ret void
+}
+; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2
+; CHECK: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12
+; CHECK: remark: test.c:3:0: in function 'g', AllocasDyn = 0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+
+; uselistorder directives
+uselistorder ptr @llvm.dbg.declare, { 4, 3, 2, 1, 0 }
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !4, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !6)
+!4 = distinct !DISubroutineType(types: !5)
+!5 = !{null}
+!6 = !{}
+!7 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !3, type: !8, flags: DIFlagArtificial)
+!8 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !10)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!11 = !DILocation(line: 0, scope: !3)
+!12 = !DILocalVariable(name: "i", scope: !13, file: !2, line: 14, type: !14)
+!13 = distinct !DILexicalBlock(scope: !3, file: !2, line: 13, column: 3)
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !DILocation(line: 14, column: 9, scope: !13)
+!16 = !DILocalVariable(name: "a", scope: !13, file: !2, line: 15, type: !17)
+!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, elements: !18)
+!18 = !{!19}
+!19 = !DISubrange(count: 2)
+!20 = !DILocation(line: 15, column: 9, scope: !13)
+!21 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !22, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !6)
+!22 = !DISubroutineType(types: !5)
+!23 = !DILocalVariable(name: "i", scope: !21, file: !2, line: 4, type: !14)
+!24 = !DILocation(line: 4, column: 7, scope: !21)
+!25 = !DILocalVariable(name: "a", scope: !21, file: !2, line: 5, type: !17)
+!26 = !DILocation(line: 5, column: 7, scope: !21)
diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll
new file mode 100644
index 00000000000000..6101a712548981
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/calls.ll
@@ -0,0 +1,112 @@
+; Check info on calls.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @personality()
+
+define void @h() personality ptr @personality !dbg !100 {
+entry:
+  ; CHECK: remark: test.c:16:5: in artificial function 'h', direct call, callee is 'f'
+  call void @f(), !dbg !102
+  ; CHECK: remark: test.c:17:5: in artificial function 'h', direct call to defined function, callee is 'g'
+  call void @g(), !dbg !104
+  ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h'
+  call void @h(), !dbg !105
+  %0 = load ptr, ptr null, align 8
+  ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call
+  call void %0(), !dbg !106
+  ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f'
+  invoke void @f() to label %fcont unwind label %cleanup, !dbg !107
+fcont:
+  ; CHECK: remark: test.c:21:5: in artificial function 'h', direct invoke to defined function, callee is 'g'
+  invoke void @g() to label %gcont unwind label %cleanup, !dbg !108
+gcont:
+  ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h'
+  invoke void @h() to label %hcont unwind label %cleanup, !dbg !109
+hcont:
+  ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke
+  invoke void %0() to label %end unwind label %cleanup, !dbg !110
+cleanup:
+  %ll = landingpad { ptr, i32 }
+  cleanup
+  br label %end
+end:
+  ret void
+}
+; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6
+; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2
+; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4
+; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4
+
+declare void @f()
+
+define void @g() personality ptr @personality !dbg !200 {
+entry:
+  ; CHECK: remark: test.c:6:3: in function 'g', direct call, callee is 'f'
+  call void @f(), !dbg !202
+  ; CHECK: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g'
+  call void @g(), !dbg !203
+  ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h'
+  call void @h(), !dbg !204
+  %0 = load ptr, ptr null, align 8
+  ; CHECK: remark: test.c:9:3: in function 'g', indirect call
+  call void %0(), !dbg !205
+  ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f'
+  invoke void @f() to label %fcont unwind label %cleanup, !dbg !206
+fcont:
+  ; CHECK: remark: test.c:11:3: in function 'g', direct invoke to defined function, callee is 'g'
+  invoke void @g() to label %gcont unwind label %cleanup, !dbg !207
+gcont:
+  ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h'
+  invoke void @h() to label %hcont unwind label %cleanup, !dbg !208
+hcont:
+  ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke
+  invoke void %0() to label %end unwind label %cleanup, !dbg !209
+cleanup:
+  %ll = landingpad { ptr, i32 }
+  cleanup
+  br label %end
+end:
+  ret void
+}
+; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6
+; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2
+; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4
+; CHECK: remark: test.c:3:0: in function 'g', Invokes = 4
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{null}
+!4 = !{}
+
+!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!101 = distinct !DISubroutineType(types: !3)
+!102 = !DILocation(line: 16, column: 5, scope: !103)
+!103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3)
+!104 = !DILocation(line: 17, column: 5, scope: !103)
+!105 = !DILocation(line: 18, column: 5, scope: !103)
+!106 = !DILocation(line: 19, column: 5, scope: !103)
+!107 = !DILocation(line: 20, column: 5, scope: !103)
+!108 = !DILocation(line: 21, column: 5, scope: !103)
+!109 = !DILocation(line: 22, column: 5, scope: !103)
+!110 = !DILocation(line: 23, column: 5, scope: !103)
+
+!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!201 = !DISubroutineType(types: !3)
+!202 = !DILocation(line: 6, column: 3, scope: !200)
+!203 = !DILocation(line: 7, column: 3, scope: !200)
+!204 = !DILocation(line: 8, column: 3, scope: !200)
+!205 = !DILocation(line: 9, column: 3, scope: !200)
+!206 = !DILocation(line: 10, column: 3, scope: !200)
+!207 = !DILocation(line: 11, column: 3, scope: !200)
+!208 = !DILocation(line: 12, column: 3, scope: !200)
+!209 = !DILocation(line: 13, column: 3, scope: !200)
diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
new file mode 100644
index 00000000000000..7d190ece46e166
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
@@ -0,0 +1,47 @@
+; Check that -kernel-info-end-lto enables kernel-info in the AMD GPU target
+; backend.
+
+; REQUIRES: amdgpu-registered-target
+
+; -kernel-info-end-lto inserts kernel-info into the LTO pipeline.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+; Omitting -kernel-info-end-lto disables kernel-info.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='lto<O2>' 2>&1 | \
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+
+; Omitting LTO disables kernel-info.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='default<O2>' -kernel-info-end-lto 2>&1 | \
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
+; NONE-NOT: remark:
+define void @test() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+attributes #0 = {
+  "omp_target_num_teams"="100"
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!6, !7, !8}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{}
+!4 = !DISubroutineType(types: !3)
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!6 = !{ptr @test, !"maxclusterrank", i32 200}
+!7 = !{ptr @test, !"maxntidx", i32 210}
+!8 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
new file mode 100644
index 00000000000000..4e790123c313a5
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
@@ -0,0 +1,47 @@
+; Check that -kernel-info-end-lto enables kernel-info in the NVPTX target
+; backend.
+
+; REQUIRES: nvptx-registered-target
+
+; -kernel-info-end-lto inserts kernel-info into the LTO pipeline.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+; Omitting -kernel-info-end-lto disables kernel-info.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='lto<O2>' 2>&1 | \
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+
+; Omitting LTO disables kernel-info.
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN:     -passes='default<O2>' -kernel-info-end-lto 2>&1 | \
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
+; NONE-NOT: remark:
+define void @test() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+attributes #0 = {
+  "omp_target_num_teams"="100"
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!6, !7, !8}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{}
+!4 = !DISubroutineType(types: !3)
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!6 = !{ptr @test, !"maxclusterrank", i32 200}
+!7 = !{ptr @test, !"maxntidx", i32 210}
+!8 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
new file mode 100644
index 00000000000000..0c98f4ad45950a
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
@@ -0,0 +1,40 @@
+; Check info on launch bounds for AMD GPU.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220
+; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221
+define void @test() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+attributes #0 = {
+  "omp_target_num_teams"="100"
+  "omp_target_thread_limit"="101"
+  "amdgpu-max-num-workgroups"="200,201,202"
+  "amdgpu-flat-work-group-size"="210,211"
+  "amdgpu-waves-per-eu"="220,221"
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{}
+!4 = !DISubroutineType(types: !3)
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
new file mode 100644
index 00000000000000..c7339f90e3ca92
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@@ -0,0 +1,36 @@
+; Check info on launch bounds for NVPTX.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101
+; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200
+; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210
+define void @test() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+attributes #0 = {
+  "omp_target_num_teams"="100"
+  "omp_target_thread_limit"="101"
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!6, !7, !8}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{}
+!4 = !DISubroutineType(types: !3)
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!6 = !{ptr @test, !"maxclusterrank", i32 200}
+!7 = !{ptr @test, !"maxntidx", i32 210}
+!8 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll
new file mode 100644
index 00000000000000..43154d2379825c
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/linkage.ll
@@ -0,0 +1,51 @@
+; Check info on linkage.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1
+define external void @f() !dbg !10 { ; explicit `external` linkage, no attribute group
+entry:
+  ret void
+}
+
+; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1
+define void @g() !dbg !20 { ; no linkage keyword (LLVM's default is external), no attribute group
+entry:
+  ret void
+}
+
+; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0
+define external void @h() #0 !dbg !30 { ; external linkage, but attribute group #0 adds "kernel"
+entry:
+  ret void
+}
+
+; CHECK: remark: test.c:33:0: in artificial function 'i', ExternalNotKernel = 0
+define weak void @i() !dbg !40 { ; weak (not external) linkage, no attribute group
+entry:
+  ret void
+}
+
+attributes #0 = { "kernel" }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{null}
+!4 = !{}
+!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!11 = !DISubroutineType(types: !3)
+!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!21 = distinct !DISubroutineType(types: !3)
+!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!31 = distinct !DISubroutineType(types: !3)
+!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4)
+!41 = distinct !DISubroutineType(types: !3)
diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md
new file mode 100644
index 00000000000000..0d13950e198edd
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/openmp/README.md
@@ -0,0 +1,40 @@
+The tests in this directory check that basic KernelInfoAnalysis functionality
+behaves reasonably for LLVM IR produced by Clang OpenMP codegen.
+
+So that these tests are straightforward to maintain and faithfully represent
+Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them.  Other tests
+more exhaustively check KernelInfoAnalysis features using reduced LLVM IR.
+
+The LLVM IR in each test file `$TEST` can be regenerated as follows if Clang
+OpenMP codegen changes or if it becomes desirable to adjust the source OpenMP
+program below.  First, remove the existing LLVM IR from `$TEST`.  Then run the
+commands below, where `$TARGET` (e.g., `nvptx64-nvidia-cuda` or
+`amdgcn-amd-amdhsa`) depends on `$TEST`:
+
+```
+$ cd /tmp
+$ cat test.c
+#pragma omp declare target
+void f();
+void g() {
+  int i;
+  int a[2];
+  f();
+  g();
+}
+#pragma omp end declare target
+
+void h(int i) {
+  #pragma omp target map(tofrom:i)
+  {
+    int i;
+    int a[2];
+    f();
+    g();
+  }
+}
+
+$ clang -g -fopenmp -fopenmp-targets=$TARGET -save-temps -c test.c
+$ llvm-dis test-openmp-$TARGET.bc
+$ cat test-openmp-$TARGET.ll >> $TEST
+```
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
new file mode 100644
index 00000000000000..ee5f65b8e5ab72
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -0,0 +1,217 @@
+; See ./README.md for how to maintain the LLVM IR in this test.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+; For some builds, we see a warning like:
+;
+;   opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple.
+;
+; But there should be no other remarks here.
+; CHECK-NOT: remark:
+
+;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
+; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
+; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init'
+; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f'
+; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g'
+; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit'
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1
+
+; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2
+
+; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes
+; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes
+; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f'
+; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g'
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0
+;  CHECK-NOT: {{.}}
+
+
+; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc'
+source_filename = "test.c"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+%struct.ident_t = type { i32, i32, i32, i32, ptr }
+%struct.DynamicEnvironmentTy = type { i16 }
+%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
+%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
+
+@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0
+@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0
+@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0
+@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0
+@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0
+@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_71f35_h_l12_debug__;13;3;;\00", align 1
+@1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8
+@__omp_offloading_fd02_71f35_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
+@__omp_offloading_fd02_71f35_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_dynamic_environment to ptr) }
+@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define internal void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !16 {
+entry:
+  %dyn_ptr.addr = alloca ptr, align 8, addrspace(5)
+  %i = alloca i32, align 4, addrspace(5)
+  %a = alloca [2 x i32], align 4, addrspace(5)
+  %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr
+  %i.ascast = addrspacecast ptr addrspace(5) %i to ptr ; not used again in this function
+  %a.ascast = addrspacecast ptr addrspace(5) %a to ptr ; not used again in this function
+  store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 ; no !dbg attachment; pointer operand is a plain `ptr` (address space 0)
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25
+  %0 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_kernel_environment to ptr), ptr %dyn_ptr), !dbg !26 ; callee is only declared in this file
+  %exec_user_code = icmp eq i32 %0, -1, !dbg !26
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 ; %0 == -1 runs the user code, anything else returns
+
+user_code.entry:                                  ; preds = %entry
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !27, metadata !DIExpression()), !dbg !30
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !31, metadata !DIExpression()), !dbg !35
+  call void @f() #5, !dbg !36 ; @f is only declared in this file
+  call void @g() #5, !dbg !37 ; @g is defined in this file
+  call void @__kmpc_target_deinit(), !dbg !38 ; callee is only declared in this file
+  ret void, !dbg !39
+
+worker.exit:                                      ; preds = %entry
+  ret void, !dbg !26
+}
+
+declare i32 @__kmpc_target_init(ptr, ptr)
+
+; Function Attrs: convergent
+declare void @f(...) #1
+
+declare void @__kmpc_target_deinit()
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_71f35_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 {
+entry:
+  %dyn_ptr.addr = alloca ptr, align 8, addrspace(5)
+  %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr
+  store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 ; no !dbg attachment; pointer operand is a plain `ptr` (address space 0)
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42
+  %0 = load ptr, ptr %dyn_ptr.addr.ascast, align 8, !dbg !43 ; reloads the argument through the same plain `ptr`
+  call void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr %0) #6, !dbg !43 ; forwards the reloaded pointer to the _debug__ function defined in this file
+  ret void, !dbg !43
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define hidden void @g() #3 !dbg !44 {
+entry:
+  %i = alloca i32, align 4, addrspace(5)
+  %a = alloca [2 x i32], align 4, addrspace(5)
+  %i.ascast = addrspacecast ptr addrspace(5) %i to ptr ; not used again in this function
+  %a.ascast = addrspacecast ptr addrspace(5) %a to ptr ; not used again in this function
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !47, metadata !DIExpression()), !dbg !48
+  tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !49, metadata !DIExpression()), !dbg !50
+  call void @f() #5, !dbg !51 ; @f is only declared in this file
+  call void @g() #5, !dbg !52 ; direct call to this same function
+  ret void, !dbg !53
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #4
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #5 = { convergent }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!omp_offload.info = !{!2}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10, !11, !12}
+!llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14}
+!opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "eff61a7cf33c8dd1bd6933250fc90157")
+!2 = !{i32 0, i32 64770, i32 466741, !"h", i32 12, i32 0, i32 0}
+!3 = !{ptr @__omp_offloading_fd02_71f35_h_l12, !"kernel", i32 1}
+!4 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!5 = !{i32 7, !"Dwarf Version", i32 5}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 7, !"openmp", i32 51}
+!9 = !{i32 7, !"openmp-device", i32 51}
+!10 = !{i32 8, !"PIC Level", i32 2}
+!11 = !{i32 7, !"frame-pointer", i32 2}
+!12 = !{i32 4, !"amdgpu_hostcall", i32 1}
+!13 = !{!"clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)"}
+!14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
+!15 = !{i32 2, i32 0}
+!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23)
+!17 = !DIFile(filename: "test.c", directory: "/tmp")
+!18 = !DISubroutineType(types: !19)
+!19 = !{null, !20}
+!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21)
+!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!23 = !{}
+!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !16, type: !20, flags: DIFlagArtificial)
+!25 = !DILocation(line: 0, scope: !16)
+!26 = !DILocation(line: 13, column: 3, scope: !16)
+!27 = !DILocalVariable(name: "i", scope: !28, file: !17, line: 14, type: !29)
+!28 = distinct !DILexicalBlock(scope: !16, file: !17, line: 13, column: 3)
+!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!30 = !DILocation(line: 14, column: 9, scope: !28)
+!31 = !DILocalVariable(name: "a", scope: !28, file: !17, line: 15, type: !32)
+!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33)
+!33 = !{!34}
+!34 = !DISubrange(count: 2)
+!35 = !DILocation(line: 15, column: 9, scope: !28)
+!36 = !DILocation(line: 16, column: 5, scope: !28)
+!37 = !DILocation(line: 17, column: 5, scope: !28)
+!38 = !DILocation(line: 18, column: 3, scope: !28)
+!39 = !DILocation(line: 18, column: 3, scope: !16)
+!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23)
+!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial)
+!42 = !DILocation(line: 0, scope: !40)
+!43 = !DILocation(line: 12, column: 1, scope: !40)
+!44 = distinct !DISubprogram(name: "g", scope: !17, file: !17, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !23)
+!45 = !DISubroutineType(types: !46)
+!46 = !{null}
+!47 = !DILocalVariable(name: "i", scope: !44, file: !17, line: 4, type: !29)
+!48 = !DILocation(line: 4, column: 7, scope: !44)
+!49 = !DILocalVariable(name: "a", scope: !44, file: !17, line: 5, type: !32)
+!50 = !DILocation(line: 5, column: 7, scope: !44)
+!51 = !DILocation(line: 6, column: 3, scope: !44)
+!52 = !DILocation(line: 7, column: 3, scope: !44)
+!53 = !DILocation(line: 8, column: 1, scope: !44)
diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
new file mode 100644
index 00000000000000..41d068b03548b0
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
@@ -0,0 +1,811 @@
+; See ./README.md for how to maintain the LLVM IR in this test.
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -disable-output %s 2>&1 | \
+; RUN:   FileCheck -match-full-lines %s
+
+; For some builds, we see a warning like:
+;
+;   opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple.
+;
+; But there should be no other remarks here.
+; CHECK-NOT: remark:
+
+;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
+; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
+; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init'
+; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f'
+; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g'
+; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit'
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 128
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Maxntidx = 128
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1
+
+; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2
+
+; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes
+; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes
+; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f'
+; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g'
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0
+;  CHECK-NOT: remark: {{.*: in function 'g',.*}}
+
+; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't
+; want to maintain a list of their allocas, calls, etc. in this test.
+
+
+; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc'
+source_filename = "test.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, ptr }
+%struct.DynamicEnvironmentTy = type { i16 }
+%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
+%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
+%struct.DeviceMemoryPoolTy = type { ptr, i64 }
+%struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 }
+%struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 }
+%"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] }
+%"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr }
+%"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 }
+%printf_args = type { ptr, i32, ptr, ptr, ptr }
+%printf_args.7 = type { ptr, i32, ptr, ptr }
+
+@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
+@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+@0 = private unnamed_addr constant [59 x i8] c";test.c;__omp_offloading_10305_5c00dd_h_l12_debug__;13;3;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 58, ptr @0 }, align 8
+@__omp_offloading_10305_5c00dd_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer
+@__omp_offloading_10305_5c00dd_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_10305_5c00dd_h_l12_dynamic_environment }
+@llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata"
+@__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8
+@__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8
+@__omp_rtl_debug_kind = weak_odr hidden constant i32 0
+@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
+@__omp_rtl_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 8
+@.str = private unnamed_addr constant [40 x i8] c"%s:%u: %s: Assertion %s (`%s`) failed.\0A\00", align 1
+@.str1 = private unnamed_addr constant [35 x i8] c"%s:%u: %s: Assertion `%s` failed.\0A\00", align 1
+@.str15 = private unnamed_addr constant [43 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Kernel.cpp\00", align 1
+@__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy = private unnamed_addr constant [36 x i8] c"void genericStateMachine(IdentTy *)\00", align 1
+@.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1
+@__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1
+@IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4
+@.str1127 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1
+@.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1
+@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1
+@_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8
+@_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8
+@_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16
+@.str544 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1
+@.str847 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1
+@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1
+@.str948 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1
+@.str1049 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1
+@.str1150 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1
+@.str1251 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1
+@.str1352 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1
+@.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1
+@__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1
+@.str1553 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1
+@.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1
+@__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1
+@_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8
+@_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define internal void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !17 {
+entry:
+  %dyn_ptr.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %a = alloca [2 x i32], align 4
+  store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 ; no !dbg attachment; stores through a plain `ptr` (address space 0)
+  tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25
+  %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_10305_5c00dd_h_l12_kernel_environment, ptr %dyn_ptr), !dbg !26 ; callee is defined later in this file
+  %exec_user_code = icmp eq i32 %0, -1, !dbg !26
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 ; %0 == -1 runs the user code, anything else returns
+
+user_code.entry:                                  ; preds = %entry
+  tail call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata !DIExpression()), !dbg !30
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !31, metadata !DIExpression()), !dbg !35
+  call void @f() #16, !dbg !36 ; @f is only declared in this file
+  call void @g() #16, !dbg !37 ; @g is defined in this file
+  call void @__kmpc_target_deinit(), !dbg !38
+  ret void, !dbg !39
+
+worker.exit:                                      ; preds = %entry
+  ret void, !dbg !26
+}
+
+; Function Attrs: convergent
+; Declaration only, with a variadic `(...)` prototype; no definition of @f
+; appears in the visible part of this file. Called from the kernel body and
+; from @g.
+declare void @f(...) #1
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+; weak_odr protected wrapper: stores %dyn_ptr to a stack slot, reloads it, and
+; forwards it to the internal @__omp_offloading_10305_5c00dd_h_l12_debug__
+; function.
+define weak_odr protected void @__omp_offloading_10305_5c00dd_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 {
+entry:
+  %dyn_ptr.addr = alloca ptr, align 8
+  store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8
+  tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42
+  %0 = load ptr, ptr %dyn_ptr.addr, align 8, !dbg !43
+  call void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr %0) #17, !dbg !43
+  ret void, !dbg !43
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+; Allocates an i32 and a [2 x i32] on the stack (referenced only by debug
+; declarations), then calls @f and unconditionally calls itself.
+define hidden void @g() #3 !dbg !44 {
+entry:
+  %i = alloca i32, align 4
+  %a = alloca [2 x i32], align 4
+  tail call void @llvm.dbg.declare(metadata ptr %i, metadata !47, metadata !DIExpression()), !dbg !48
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !49, metadata !DIExpression()), !dbg !50
+  call void @f() #16, !dbg !51
+  ; Direct self-recursive call with no guarding condition.
+  call void @g() #16, !dbg !52
+  ret void, !dbg !53
+}
+
+; Function Attrs: convergent mustprogress nounwind
+; Initialization routine called from the kernel body.
+;
+; It tests mask 2 of the byte at offset 2 of %KernelEnvironment (%ExecMode) to
+; pick one of two paths. On each path a single selected thread stores initial
+; values into @IsSPMDMode, @_ZN4ompx5state9TeamStateE (offsets 0..40),
+; @_ZN4ompx5state12ThreadStatesE and the two environment-pointer globals; every
+; thread zeroes one byte of @_ZN12_GLOBAL__N_122SharedMemorySmartStackE at
+; offset 512 + tid.x.
+;
+; Return value (see the phi in %cleanup): -1 on the mask-set path and for the
+; thread selected in %if.end9; otherwise the thread's tid.x value.
+define internal noundef i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %KernelEnvironment, ptr nofree noundef nonnull align 8 dereferenceable(16) %KernelLaunchEnvironment) #4 {
+entry:
+  ; Stack slot later handed to @__kmpc_kernel_parallel inside the loop.
+  %WorkFn.i = alloca ptr, align 8
+  ; %tobool.not is true when mask 2 of the ExecMode byte is clear.
+  ; %tobool3.not is true when the first byte of %KernelEnvironment is nonzero.
+  %ExecMode = getelementptr inbounds i8, ptr %KernelEnvironment, i64 2
+  %0 = load i8, ptr %ExecMode, align 2, !tbaa !54
+  %1 = and i8 %0, 2
+  %tobool.not = icmp eq i8 %1, 0
+  %2 = load i8, ptr %KernelEnvironment, align 8, !tbaa !60
+  %tobool3.not = icmp ne i8 %2, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  ; Mask set: the thread with tid.x == 0 performs the full set of stores.
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18
+  %cmp.i.i.i = icmp eq i32 %3, 0
+  br i1 %cmp.i.i.i, label %if.then.i, label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge
+
+if.then.i:                                        ; preds = %if.then
+  ; @IsSPMDMode := 1, then the initial TeamState field values:
+  ; offsets 0,4,8,12 := 0; 16,20,24,28 := 1; 32 := 0; pointer at 40 := null.
+  store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61
+  %idxprom.i.i = zext nneg i32 %3 to i64
+  %arrayidx.i.i = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i
+  %4 = addrspacecast ptr %arrayidx.i.i to ptr addrspace(3)
+  store i8 0, ptr addrspace(3) %4, align 1, !tbaa !62
+  store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74
+  store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75
+  store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76
+  store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76
+  store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76
+  br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit
+
+_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge: ; preds = %if.then
+  ; All other threads only zero their own byte at offset 512 + tid.x.
+  %idxprom.i.i.c = zext i32 %3 to i64
+  %arrayidx.i.i.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i.c
+  %5 = addrspacecast ptr %arrayidx.i.i.c to ptr addrspace(3)
+  store i8 0, ptr addrspace(3) %5, align 1, !tbaa !62
+  br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit
+
+_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit: ; preds = %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge, %if.then.i
+  ; Both mask-set sub-paths call the barrier helper before continuing.
+  tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  ; Mask clear: the selected thread is the one whose tid.x equals
+  ; (ntid.x - 1) & -32.
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18
+  %sub.i.i.i7 = add i32 %6, -1
+  %and.i.i.i8 = and i32 %sub.i.i.i7, -32
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18
+  %cmp.i.i.i9 = icmp eq i32 %7, %and.i.i.i8
+  br i1 %cmp.i.i.i9, label %if.then.i11, label %if.end.critedge
+
+if.then.i11:                                      ; preds = %if.else
+  ; Same stores as %if.then.i, except that @IsSPMDMode receives 0.
+  store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61
+  %idxprom.i.i13 = zext i32 %7 to i64
+  %arrayidx.i.i14 = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13
+  %8 = addrspacecast ptr %arrayidx.i.i14 to ptr addrspace(3)
+  store i8 0, ptr addrspace(3) %8, align 1, !tbaa !62
+  store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72
+  store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73
+  store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74
+  store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75
+  store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76
+  store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76
+  store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76
+  br label %if.end
+
+if.end.critedge:                                  ; preds = %if.else
+  ; Non-selected threads only zero their own byte at offset 512 + tid.x.
+  %idxprom.i.i13.c = zext i32 %7 to i64
+  %arrayidx.i.i14.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13.c
+  %9 = addrspacecast ptr %arrayidx.i.i14.c to ptr addrspace(3)
+  store i8 0, ptr addrspace(3) %9, align 1, !tbaa !62
+  br label %if.end
+
+if.end:                                           ; preds = %if.end.critedge, %if.then.i11, %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit
+  ; Re-dispatch on the same mask: set -> %if.then7, clear -> %if.end9.
+  br i1 %tobool.not, label %if.end9, label %if.then7
+
+if.then7:                                         ; preds = %if.end
+  ; %tobool.i.i is true when bit 0 is set in both @__omp_rtl_debug_kind and the
+  ; first i32 of @__omp_rtl_device_environment. When it is true, each TeamState
+  ; field is compared with the value stored during initialization and a
+  ; mismatch calls @__assert_fail_internal; when it is false the same equality
+  ; is passed to llvm.assume instead.
+  %10 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61
+  %11 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77
+  %and.i.i.i21 = and i32 %10, 1
+  %and.i.i = and i32 %and.i.i.i21, %11
+  %tobool.i.i = icmp ne i32 %and.i.i, 0
+  %.pre67.i.i.i = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !80
+  %cmp.i.i.i22 = icmp ne i32 %.pre67.i.i.i, 0
+  %or.cond.not.i.i.i = select i1 %tobool.i.i, i1 %cmp.i.i.i22, i1 false
+  br i1 %or.cond.not.i.i.i, label %if.then.i.i.i, label %if.else.i.i.i
+
+if.then.i.i.i:                                    ; preds = %if.then7
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str847, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else.i.i.i:                                    ; preds = %if.then7
+  ; Field at offset 0 is 0; next check the field at offset 4 against 0.
+  %cmp5.i.i.i = icmp eq i32 %.pre67.i.i.i, 0
+  tail call void @llvm.assume(i1 noundef %cmp5.i.i.i) #21
+  %12 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !81
+  br i1 %tobool.i.i, label %land.lhs.true7.i.i.i, label %if.else11.i.i.i
+
+land.lhs.true7.i.i.i:                             ; preds = %if.else.i.i.i
+  %cmp9.i.i.i = icmp eq i32 %12, 0
+  br i1 %cmp9.i.i.i, label %if.else11.i.i.i, label %if.then10.i.i.i
+
+if.then10.i.i.i:                                  ; preds = %land.lhs.true7.i.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str948, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else11.i.i.i:                                  ; preds = %land.lhs.true7.i.i.i, %if.else.i.i.i
+  ; Field at offset 8 is checked against 0.
+  %13 = phi i32 [ 0, %land.lhs.true7.i.i.i ], [ %12, %if.else.i.i.i ]
+  %cmp14.i.i.i = icmp eq i32 %13, 0
+  tail call void @llvm.assume(i1 noundef %cmp14.i.i.i) #21
+  %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !82
+  br i1 %tobool.i.i, label %land.lhs.true17.i.i.i, label %if.else21.i.i.i
+
+land.lhs.true17.i.i.i:                            ; preds = %if.else11.i.i.i
+  %cmp19.i.i.i = icmp eq i32 %14, 0
+  br i1 %cmp19.i.i.i, label %if.else21.i.i.i, label %if.then20.i.i.i
+
+if.then20.i.i.i:                                  ; preds = %land.lhs.true17.i.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1049, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else21.i.i.i:                                  ; preds = %land.lhs.true17.i.i.i, %if.else11.i.i.i
+  ; Field at offset 16 is checked against 1.
+  %15 = phi i32 [ 0, %land.lhs.true17.i.i.i ], [ %14, %if.else11.i.i.i ]
+  %cmp24.i.i.i = icmp eq i32 %15, 0
+  tail call void @llvm.assume(i1 noundef %cmp24.i.i.i) #21
+  %16 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !83
+  br i1 %tobool.i.i, label %land.lhs.true27.i.i.i, label %if.else31.i.i.i
+
+land.lhs.true27.i.i.i:                            ; preds = %if.else21.i.i.i
+  %cmp29.i.i.i = icmp eq i32 %16, 1
+  br i1 %cmp29.i.i.i, label %if.else31.i.i.i, label %if.then30.i.i.i
+
+if.then30.i.i.i:                                  ; preds = %land.lhs.true27.i.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1150, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else31.i.i.i:                                  ; preds = %land.lhs.true27.i.i.i, %if.else21.i.i.i
+  ; Field at offset 20 is checked against 1.
+  %17 = phi i32 [ 1, %land.lhs.true27.i.i.i ], [ %16, %if.else21.i.i.i ]
+  %cmp34.i.i.i = icmp eq i32 %17, 1
+  tail call void @llvm.assume(i1 noundef %cmp34.i.i.i) #21
+  %18 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !84
+  br i1 %tobool.i.i, label %land.lhs.true37.i.i.i, label %if.else.critedge.i.critedge.critedge.critedge
+
+land.lhs.true37.i.i.i:                            ; preds = %if.else31.i.i.i
+  %cmp39.i.i.i = icmp eq i32 %18, 1
+  br i1 %cmp39.i.i.i, label %if.else41.i.i.i, label %if.then40.i.i.i
+
+if.then40.i.i.i:                                  ; preds = %land.lhs.true37.i.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1251, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else41.i.i.i:                                  ; preds = %land.lhs.true37.i.i.i
+  ; The compare below is between two constants (1 == 1).
+  %cmp44.i.i.i = icmp eq i32 1, 1
+  tail call void @llvm.assume(i1 noundef %cmp44.i.i.i) #21
+  br i1 %tobool.i.i, label %land.lhs.true47.i.i.i, label %if.else.critedge.i.critedge
+
+land.lhs.true47.i.i.i:                            ; preds = %if.else41.i.i.i
+  ; Field at offset 24 is checked against 1.
+  %19 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !85
+  %cmp49.i.i.i = icmp eq i32 %19, 1
+  br i1 %cmp49.i.i.i, label %if.else51.i.i.i, label %if.then50.i.i.i
+
+if.then50.i.i.i:                                  ; preds = %land.lhs.true47.i.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1352, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20
+  unreachable
+
+if.else51.i.i.i:                                  ; preds = %land.lhs.true47.i.i.i
+  br i1 %tobool.i.i, label %land.lhs.true.i.i, label %if.else.critedge.i.critedge
+
+land.lhs.true.i.i:                                ; preds = %if.else51.i.i.i
+  ; Field at offset 28 is checked against 1.
+  %20 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73
+  %cmp.i.i = icmp eq i32 %20, 1
+  br i1 %cmp.i.i, label %land.lhs.true8.i.i, label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %land.lhs.true.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20
+  unreachable
+
+land.lhs.true8.i.i:                               ; preds = %land.lhs.true.i.i
+  ; Field at offset 32 is checked against 0.
+  %21 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74
+  %cmp10.i.i = icmp eq i32 %21, 0
+  br i1 %cmp10.i.i, label %land.lhs.true.i24, label %if.then11.i.i
+
+if.then11.i.i:                                    ; preds = %land.lhs.true8.i.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1553, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20
+  unreachable
+
+land.lhs.true.i24:                                ; preds = %land.lhs.true8.i.i
+  ; Final check on this path: @IsSPMDMode must be nonzero.
+  %22 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61
+  %tobool.i25.i.not = icmp eq i32 %22, 0
+  br i1 %tobool.i25.i.not, label %if.then.i25, label %_ZN4ompx5state18assumeInitialStateEb.exit
+
+if.then.i25:                                      ; preds = %land.lhs.true.i24
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20
+  unreachable
+
+if.else.critedge.i.critedge.critedge.critedge:    ; preds = %if.else31.i.i.i
+  %cmp44.i.i.i.c = icmp eq i32 %18, 1
+  tail call void @llvm.assume(i1 noundef %cmp44.i.i.i.c) #21
+  br label %if.else.critedge.i.critedge
+
+if.else.critedge.i.critedge:                      ; preds = %if.else41.i.i.i, %if.else.critedge.i.critedge.critedge.critedge, %if.else51.i.i.i
+  %.pre.i = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61
+  %23 = icmp ne i32 %.pre.i, 0
+  br label %_ZN4ompx5state18assumeInitialStateEb.exit
+
+_ZN4ompx5state18assumeInitialStateEb.exit:        ; preds = %land.lhs.true.i24, %if.else.critedge.i.critedge
+  ; Assume @IsSPMDMode is nonzero, run the barrier helper, and return -1 via
+  ; %cleanup.
+  %cmp8.i = phi i1 [ %23, %if.else.critedge.i.critedge ], [ true, %land.lhs.true.i24 ]
+  tail call void @llvm.assume(i1 noundef %cmp8.i) #21
+  tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19
+  br label %cleanup
+
+if.end9:                                          ; preds = %if.end
+  ; Mask clear: the thread with tid.x == (ntid.x - 1) & -32 returns -1.
+  %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18
+  %sub.i.i = add i32 %24, -1
+  %and.i.i26 = and i32 %sub.i.i, -32
+  %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18
+  %cmp.i.i27 = icmp eq i32 %25, %and.i.i26
+  br i1 %cmp.i.i27, label %cleanup, label %if.end12
+
+if.end12:                                         ; preds = %if.end9
+  ; Remaining threads enter the loop only if the first KernelEnvironment byte is
+  ; nonzero and tid.x < ntid.x - 32 (unsigned); otherwise they return tid.x.
+  %sub.i = add i32 %24, -32
+  %cmp = icmp ult i32 %25, %sub.i
+  %or.cond33 = and i1 %tobool3.not, %cmp
+  br i1 %or.cond33, label %do.body.i.preheader, label %cleanup
+
+do.body.i.preheader:                              ; preds = %if.end12
+  ; Same flag computation as in %if.then7, hoisted out of the loop.
+  %26 = load i32, ptr @__omp_rtl_debug_kind, align 4
+  %27 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8
+  %and.i.i29 = and i32 %26, 1
+  %and.i = and i32 %and.i.i29, %27
+  %tobool.i = icmp ne i32 %and.i, 0
+  br label %do.body.i
+
+do.body.i:                                        ; preds = %do.body.i.preheader, %if.end9.i
+  ; Loop body: barrier.sync(8), then ask @__kmpc_kernel_parallel to fill
+  ; %WorkFn.i. A null pointer leaves the loop.
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn.i) #22
+  store ptr null, ptr %WorkFn.i, align 8, !tbaa !76
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18
+  %call1.i = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn.i) #22
+  %28 = load ptr, ptr %WorkFn.i, align 8, !tbaa !76
+  %tobool.not.not.i = icmp eq ptr %28, null
+  br i1 %tobool.not.not.i, label %_ZL19genericStateMachineP7IdentTy.exit, label %if.end.i
+
+if.end.i:                                         ; preds = %do.body.i
+  ; Only threads for which @__kmpc_kernel_parallel returned true run the
+  ; function pointer.
+  br i1 %call1.i, label %if.then3.i, label %if.end9.i
+
+if.then3.i:                                       ; preds = %if.end.i
+  %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %tobool.i30 = icmp ne i32 %29, 0
+  %or.cond = select i1 %tobool.i, i1 %tobool.i30, i1 false
+  br i1 %or.cond, label %if.then6.i, label %if.else.i
+
+if.then6.i:                                       ; preds = %if.then3.i
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 58, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20
+  unreachable
+
+if.else.i:                                        ; preds = %if.then3.i
+  ; Indirect call through the loaded pointer %28 with arguments (0, tid.x),
+  ; followed by @__kmpc_kernel_end_parallel.
+  %tobool.i31.not = icmp eq i32 %29, 0
+  tail call void @llvm.assume(i1 noundef %tobool.i31.not) #21
+  tail call void %28(i32 noundef 0, i32 noundef %25) #23
+  tail call void @__kmpc_kernel_end_parallel() #24
+  br label %if.end9.i
+
+if.end9.i:                                        ; preds = %if.else.i, %if.end.i
+  ; Second barrier.sync(8) of the iteration, then loop back.
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22
+  br label %do.body.i, !llvm.loop !86
+
+_ZL19genericStateMachineP7IdentTy.exit:           ; preds = %do.body.i
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end12, %_ZL19genericStateMachineP7IdentTy.exit, %if.end9, %_ZN4ompx5state18assumeInitialStateEb.exit
+  ; -1 from the mask-set path and from the %if.end9 thread; tid.x (%25) from the
+  ; loop exit and from %if.end12.
+  %retval.0 = phi i32 [ -1, %_ZN4ompx5state18assumeInitialStateEb.exit ], [ -1, %if.end9 ], [ %25, %_ZL19genericStateMachineP7IdentTy.exit ], [ %25, %if.end12 ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+; NVVM intrinsic declaration (reads the PTX tid.x special register). In this
+; file its i32 result is compared against thread-selection values and used as
+; an array index.
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind
+; Barrier helper: the body is a single call to @llvm.nvvm.barrier0. The
+; %Ordering argument is not used in the body (callers in this file pass poison).
+define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %Ordering) local_unnamed_addr #6 {
+entry:
+  tail call void @llvm.nvvm.barrier0() #25
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+; NVVM intrinsic declaration (reads the PTX ntid.x special register). In this
+; file its i32 result feeds the `(ntid.x - 1) & -32` and `ntid.x - 32`
+; computations that are compared against tid.x.
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5
+
+; Function Attrs: convergent mustprogress noreturn nounwind
+; Assertion-failure reporter: packs its arguments into a stack buffer, calls
+; @vprintf with a format string and that buffer, then traps (never returns).
+;
+; Buffer layout by byte offset, depending on whether %msg is null:
+;   %msg != null: %tmp  = { file@0, line@8, function@16, msg@24, expr@32 },
+;                 format string @.str
+;   %msg == null: %tmp1 = { file@0, line@8, function@16, expr@24 },
+;                 format string @.str1
+define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %expr, ptr noundef %msg, ptr nofree noundef nonnull dereferenceable(69) %file, i32 noundef %line, ptr nofree noundef nonnull dereferenceable(20) %function) unnamed_addr #7 {
+entry:
+  ; One buffer per format; only one of them is filled on any given call.
+  %tmp = alloca %printf_args, align 8
+  %tmp1 = alloca %printf_args.7, align 8
+  %tobool.not = icmp eq ptr %msg, null
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  ; With a message: file, line and function are stored here; msg and expr are
+  ; stored by the shared tail in %if.end.
+  store ptr %file, ptr %tmp, align 8
+  %0 = getelementptr inbounds i8, ptr %tmp, i64 8
+  store i32 %line, ptr %0, align 8
+  %1 = getelementptr inbounds i8, ptr %tmp, i64 16
+  store ptr %function, ptr %1, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  ; Without a message: file and line are stored here; function and expr are
+  ; stored by the shared tail in %if.end.
+  store ptr %file, ptr %tmp1, align 8
+  %2 = getelementptr inbounds i8, ptr %tmp1, i64 8
+  store i32 %line, ptr %2, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ; The phis select the buffer, the two remaining offsets, the pointer to store
+  ; at the first of those offsets (%function or %msg), and the format string.
+  %.sink12 = phi i64 [ 16, %if.else ], [ 24, %if.then ]
+  %tmp1.sink11 = phi ptr [ %tmp1, %if.else ], [ %tmp, %if.then ]
+  %function.sink = phi ptr [ %function, %if.else ], [ %msg, %if.then ]
+  %.sink9 = phi i64 [ 24, %if.else ], [ 32, %if.then ]
+  %.str1.sink = phi ptr [ @.str1, %if.else ], [ @.str, %if.then ]
+  %3 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink12
+  store ptr %function.sink, ptr %3, align 8
+  %4 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink9
+  store ptr %expr, ptr %4, align 8
+  %call.i.i = call noundef i32 @vprintf(ptr noundef nonnull %.str1.sink, ptr noundef nonnull %tmp1.sink11) #24
+  call void @llvm.trap() #26
+  unreachable
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+; Intrinsic declaration; used in this file on the non-asserting side of each
+; runtime check.
+declare void @llvm.assume(i1 noundef) #8
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+; Intrinsic declaration; used in this file around the 8-byte %WorkFn stack
+; slots.
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #9
+
+; Function Attrs: convergent nocallback nounwind
+; NVVM intrinsic declaration; called with the constant 8 in the loop of
+; @__kmpc_target_init.
+declare void @llvm.nvvm.barrier.sync(i32) #10
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none)
+; Copies the pointer stored at offset 40 of @_ZN4ompx5state9TeamStateE into
+; *%WorkFn and reports whether the calling thread should run it.
+;
+; Returns false when that pointer is null. Otherwise returns
+; tid.x < limit (unsigned), where limit is the i32 at TeamState offset 28 if it
+; is nonzero, else ntid.x - 32 when @IsSPMDMode is 0 or ntid.x when it is
+; nonzero.
+define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) local_unnamed_addr #11 {
+entry:
+  ; The output parameter is written even when the loaded pointer is null.
+  %0 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76
+  store ptr %0, ptr %WorkFn, align 8, !tbaa !76
+  %tobool.not = icmp eq ptr %0, null
+  br i1 %tobool.not, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  ; %cond.i is the limit described above; %mul.neg.i.i.i is -32 or 0 depending
+  ; on @IsSPMDMode.
+  %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27
+  %2 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !61
+  %tobool.not.i = icmp eq i32 %2, 0
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18
+  %4 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %tobool.i.not.i.i = icmp eq i32 %4, 0
+  %mul.neg.i.i.i = select i1 %tobool.i.not.i.i, i32 -32, i32 0
+  %sub.i.i.i = add i32 %mul.neg.i.i.i, %3
+  %cond.i = select i1 %tobool.not.i, i32 %sub.i.i.i, i32 %2
+  %cmp = icmp ult i32 %1, %cond.i
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  %retval.0 = phi i1 [ %cmp, %if.end ], [ false, %entry ]
+  ret i1 %retval.0
+}
+
+; Function Attrs: convergent mustprogress noinline nounwind
+; Called after the indirect work-function call in @__kmpc_target_init.
+;
+; Checks (assert or assume, depending on the debug flag) that @IsSPMDMode is 0,
+; then, if thread states may exist, frees the calling thread's entry in the
+; @_ZN4ompx5state12ThreadStatesE array and replaces it with the pointer that
+; was stored at offset 32 of the freed entry. The @IsSPMDMode check is then
+; repeated.
+define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #12 {
+entry:
+  ; %tobool.i is true when bit 0 is set in both @__omp_rtl_debug_kind and the
+  ; first i32 of @__omp_rtl_device_environment; it selects assert vs. assume.
+  %0 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61
+  %1 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77
+  %and.i.i = and i32 %0, 1
+  %and.i = and i32 %and.i.i, %1
+  %tobool.i = icmp ne i32 %and.i, 0
+  %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %tobool.i1 = icmp ne i32 %2, 0
+  %or.cond = select i1 %tobool.i, i1 %tobool.i1, i1 false
+  br i1 %or.cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 297, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20
+  unreachable
+
+if.else:                                          ; preds = %entry
+  ; The thread-state cleanup runs only when @__omp_rtl_assume_no_thread_state
+  ; is 0 and the i32 at TeamState offset 32 is nonzero.
+  %tobool.i2.not = icmp eq i32 %2, 0
+  tail call void @llvm.assume(i1 noundef %tobool.i2.not) #21
+  %3 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !61
+  %tobool.not.i.i = icmp eq i32 %3, 0
+  %4 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8
+  %tobool.not.i = icmp ne i32 %4, 0
+  %or.cond.not.i = select i1 %tobool.not.i.i, i1 %tobool.not.i, i1 false
+  br i1 %or.cond.not.i, label %lor.rhs.i, label %_ZN4ompx5state19resetStateForThreadEj.exit
+
+lor.rhs.i:                                        ; preds = %if.else
+  ; Load this thread's entry (indexed by tid.x); a null entry needs no cleanup.
+  %5 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27
+  %6 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76
+  %idxprom.i = zext i32 %5 to i64
+  %arrayidx.i = getelementptr inbounds ptr, ptr %6, i64 %idxprom.i
+  %7 = load ptr, ptr %arrayidx.i, align 8, !tbaa !76
+  %tobool1.not.i = icmp eq ptr %7, null
+  br i1 %tobool1.not.i, label %_ZN4ompx5state19resetStateForThreadEj.exit, label %if.end4.i, !prof !88
+
+if.end4.i:                                        ; preds = %lor.rhs.i
+  ; Read the pointer at offset 32 before freeing the entry, then store it back
+  ; into the same array slot. @IsSPMDMode is reloaded after the call to @free.
+  %PreviousThreadState7.i = getelementptr inbounds i8, ptr %7, i64 32
+  %8 = load ptr, ptr %PreviousThreadState7.i, align 8, !tbaa !89
+  tail call void @free(ptr noundef nonnull dereferenceable(40) %7) #28
+  %9 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76
+  %arrayidx11.i = getelementptr inbounds ptr, ptr %9, i64 %idxprom.i
+  store ptr %8, ptr %arrayidx11.i, align 8, !tbaa !76
+  %.pre = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  br label %_ZN4ompx5state19resetStateForThreadEj.exit
+
+_ZN4ompx5state19resetStateForThreadEj.exit:       ; preds = %if.else, %lor.rhs.i, %if.end4.i
+  ; %10 is the @IsSPMDMode value for the second check: the constant 0 on paths
+  ; that did not reload it, the reloaded value otherwise.
+  %10 = phi i32 [ 0, %if.else ], [ 0, %lor.rhs.i ], [ %.pre, %if.end4.i ]
+  %tobool.i6 = icmp ne i32 %10, 0
+  %or.cond8 = select i1 %tobool.i, i1 %tobool.i6, i1 false
+  br i1 %or.cond8, label %if.then7, label %if.else8
+
+if.then7:                                         ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 300, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20
+  unreachable
+
+if.else8:                                         ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit
+  %tobool.i7.not = icmp eq i32 %10, 0
+  tail call void @llvm.assume(i1 noundef %tobool.i7.not) #21
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+; Intrinsic declaration; paired in this file with @llvm.lifetime.start.p0 on
+; the %WorkFn stack slots.
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #9
+
+; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite)
+; extern_weak declaration; called from @__kmpc_kernel_end_parallel to release
+; a thread-state entry.
+declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #13
+
+; Function Attrs: convergent
+; External declaration; called from @__assert_fail_internal with a format
+; string and a pointer to the packed argument buffer.
+declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #14
+
+; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write)
+; Intrinsic declaration; called from @__assert_fail_internal after @vprintf.
+declare void @llvm.trap() #15
+
+; Function Attrs: convergent nocallback nounwind
+; NVVM intrinsic declaration; the only instruction in the threadsAligned
+; barrier helper.
+declare void @llvm.nvvm.barrier0() #10
+
+; Function Attrs: convergent mustprogress nounwind
+define internal void @__kmpc_target_deinit() #4 {
+entry:
+  %WorkFn = alloca ptr, align 8
+  %0 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61
+  %tobool.i.not = icmp eq i32 %0, 0
+  br i1 %tobool.i.not, label %if.end, label %cleanup
+
+if.end:                                           ; preds = %entry
+  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18
+  %sub.i.i = add i32 %1, -1
+  %and.i.i = and i32 %sub.i.i, -32
+  %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18
+  %cmp.i.i = icmp eq i32 %2, %and.i.i
+  br i1 %cmp.i.i, label %if.then3, label %if.else
+
+if.then3:                                         ; preds = %if.end
+  store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76
+  br label %cleanup
+
+if.else:                                          ; preds = %if.end
+  %3 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76
+  %4 = load i8, ptr %3, align 8, !tbaa !91
+  %tobool6.not = icmp eq i8 %4, 0
+  br i1 %tobool6.not, label %if.then7, label %cleanup
+
+if.then7:                                         ; preds = %if.else
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn) #29
+  store ptr null, ptr %WorkFn, align 8, !tbaa !76
+  %call8 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) #22
+  %5 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61
+  %6 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77
+  %and.i.i1 = and i32 %5, 1
+  %and.i = and i32 %and.i.i1, %6
+  %tobool.i2.not = icmp eq i32 %and.i, 0
+  %7 = load ptr, ptr %WorkFn, align 8
+  %cmp = icmp eq ptr %7, null
+  %or.cond = select i1 %tobool.i2.not, i1 true, i1 %cmp
+  br i1 %or.cond, label %if.else11, label %if.then10
+
+if.then10:                                        ; preds = %if.then7
+  tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 150, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20
+  unreachable
+
+if.else11:                                        ; preds = %if.then7
+  tail call void @llvm.assume(i1 noundef %cmp) #21
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn) #22
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.else11, %if.else, %if.then3, %entry
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #5
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" }
+attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" }
+attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" }
+attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" }
+attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #6 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #7 = { convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #8 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #10 = { convergent nocallback nounwind }
+attributes #11 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #12 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #13 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #14 = { convergent "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" }
+attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #16 = { convergent }
+attributes #17 = { nounwind }
+attributes #18 = { "llvm.assume"="ompx_no_call_asm" }
+attributes #19 = { convergent nounwind "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" }
+attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" }
+attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" }
+attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" }
+attributes #23 = { convergent nounwind }
+attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" }
+attributes #25 = { "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" }
+attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" }
+attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" }
+attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" }
+attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+!llvm.dbg.cu = !{!10}
+!nvvm.annotations = !{!12, !13}
+!omp_offload.info = !{!14}
+!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
+!1 = !{i32 7, !"Dwarf Version", i32 2}
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 7, !"openmp", i32 51}
+!5 = !{i32 7, !"openmp-device", i32 51}
+!6 = !{i32 8, !"PIC Level", i32 2}
+!7 = !{i32 7, !"frame-pointer", i32 2}
+!8 = !{i32 1, !"ThinLTO", i32 0}
+!9 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!10 = distinct !DICompileUnit(language: DW_LANG_C11, file: !11, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!11 = !DIFile(filename: "test.c", directory: "/tmp")
+!12 = !{ptr @__omp_offloading_10305_5c00dd_h_l12_debug__, !"maxntidx", i32 128}
+!13 = !{ptr @__omp_offloading_10305_5c00dd_h_l12, !"kernel", i32 1}
+!14 = !{i32 0, i32 66309, i32 6029533, !"h", i32 12, i32 0, i32 0}
+!15 = !{!"clang version 19.0.0git"}
+!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!17 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12_debug__", scope: !11, file: !11, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23)
+!18 = !DISubroutineType(types: !19)
+!19 = !{null, !20}
+!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21)
+!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!23 = !{}
+!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !17, type: !20, flags: DIFlagArtificial)
+!25 = !DILocation(line: 0, scope: !17)
+!26 = !DILocation(line: 13, column: 3, scope: !17)
+!27 = !DILocalVariable(name: "i", scope: !28, file: !11, line: 14, type: !29)
+!28 = distinct !DILexicalBlock(scope: !17, file: !11, line: 13, column: 3)
+!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!30 = !DILocation(line: 14, column: 9, scope: !28)
+!31 = !DILocalVariable(name: "a", scope: !28, file: !11, line: 15, type: !32)
+!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33)
+!33 = !{!34}
+!34 = !DISubrange(count: 2)
+!35 = !DILocation(line: 15, column: 9, scope: !28)
+!36 = !DILocation(line: 16, column: 5, scope: !28)
+!37 = !DILocation(line: 17, column: 5, scope: !28)
+!38 = !DILocation(line: 18, column: 3, scope: !28)
+!39 = !DILocation(line: 18, column: 3, scope: !17)
+!40 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12", scope: !11, file: !11, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23)
+!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial)
+!42 = !DILocation(line: 0, scope: !40)
+!43 = !DILocation(line: 12, column: 1, scope: !40)
+!44 = distinct !DISubprogram(name: "g", scope: !11, file: !11, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !10, retainedNodes: !23)
+!45 = !DISubroutineType(types: !46)
+!46 = !{null}
+!47 = !DILocalVariable(name: "i", scope: !44, file: !11, line: 4, type: !29)
+!48 = !DILocation(line: 4, column: 7, scope: !44)
+!49 = !DILocalVariable(name: "a", scope: !44, file: !11, line: 5, type: !32)
+!50 = !DILocation(line: 5, column: 7, scope: !44)
+!51 = !DILocation(line: 6, column: 3, scope: !44)
+!52 = !DILocation(line: 7, column: 3, scope: !44)
+!53 = !DILocation(line: 8, column: 1, scope: !44)
+!54 = !{!55, !58, i64 2}
+!55 = !{!"_ZTS26ConfigurationEnvironmentTy", !56, i64 0, !56, i64 1, !58, i64 2, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24}
+!56 = !{!"omnipotent char", !57, i64 0}
+!57 = !{!"Simple C++ TBAA"}
+!58 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !56, i64 0}
+!59 = !{!"int", !56, i64 0}
+!60 = !{!55, !56, i64 0}
+!61 = !{!59, !59, i64 0}
+!62 = !{!56, !56, i64 0}
+!63 = !{!64, !59, i64 0}
+!64 = !{!"_ZTSN4ompx5state11TeamStateTyE", !65, i64 0, !59, i64 28, !59, i64 32, !66, i64 40}
+!65 = !{!"_ZTSN4ompx5state10ICVStateTyE", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24}
+!66 = !{!"any pointer", !56, i64 0}
+!67 = !{!64, !59, i64 4}
+!68 = !{!64, !59, i64 8}
+!69 = !{!64, !59, i64 12}
+!70 = !{!64, !59, i64 16}
+!71 = !{!64, !59, i64 20}
+!72 = !{!64, !59, i64 24}
+!73 = !{!64, !59, i64 28}
+!74 = !{!64, !59, i64 32}
+!75 = !{!64, !66, i64 40}
+!76 = !{!66, !66, i64 0}
+!77 = !{!78, !59, i64 0}
+!78 = !{!"_ZTS19DeviceEnvironmentTy", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !79, i64 16, !79, i64 24, !79, i64 32, !79, i64 40}
+!79 = !{!"long", !56, i64 0}
+!80 = !{!65, !59, i64 0}
+!81 = !{!65, !59, i64 4}
+!82 = !{!65, !59, i64 8}
+!83 = !{!65, !59, i64 16}
+!84 = !{!65, !59, i64 20}
+!85 = !{!65, !59, i64 24}
+!86 = distinct !{!86, !87}
+!87 = !{!"llvm.loop.mustprogress"}
+!88 = !{!"branch_weights", i32 2000, i32 1}
+!89 = !{!90, !66, i64 32}
+!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !65, i64 0, !66, i64 32}
+!91 = !{!92, !56, i64 0}
+!92 = !{!"_ZTS19KernelEnvironmentTy", !55, i64 0, !66, i64 32, !66, i64 40}

>From a7656de882610df9a7f1e60c65ce214cef70a32a Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 17:40:35 -0400
Subject: [PATCH 02/27] Move docs to KernelInfo.rst

---
 llvm/docs/KernelInfo.rst                | 61 +++++++++++++++++++++++++
 llvm/include/llvm/Analysis/KernelInfo.h | 29 +-----------
 2 files changed, 62 insertions(+), 28 deletions(-)
 create mode 100644 llvm/docs/KernelInfo.rst

diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst
new file mode 100644
index 00000000000000..397b32602bce22
--- /dev/null
+++ b/llvm/docs/KernelInfo.rst
@@ -0,0 +1,61 @@
+==========
+KernelInfo
+==========
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This LLVM IR pass reports various statistics for codes compiled for GPUs.  The
+goal of these statistics is to help identify bad code patterns and ways to
+mitigate them.  The pass operates at the LLVM IR level so that it can, in
+theory, support any LLVM-based compiler for programming languages supporting
+GPUs.
+
+By default, the pass is disabled.  For convenience, the command-line option
+``-kernel-info-end-lto`` inserts it at the end of LTO, and options like
+``-Rpass=kernel-info`` enable its remarks.  Example ``opt`` and ``clang``
+command lines appear in the next section.
+
+Remarks include summary statistics (e.g., total size of static allocas) and
+individual occurrences (e.g., source location of each alloca).  Examples of the
+output appear in tests in ``llvm/test/Analysis/KernelInfo``.
+
+Example Command Lines
+=====================
+
+To analyze a C program as it appears to an LLVM GPU backend at the end of LTO:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -kernel-info-end-lto
+
+To analyze specified LLVM IR, perhaps previously generated by something like
+``clang -save-temps -g -fopenmp --offload-arch=native test.c``:
+
+.. code-block:: shell
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes=kernel-info
+
+kernel-info can also be inserted into a specified LLVM pass pipeline using
+``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -kernel-info-end-lto \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>'
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)'
diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 5495bb2fd4d925..96cd5f68af6466 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -9,34 +9,7 @@
 // This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
 // classes used to extract function properties from a GPU kernel.
 //
-// To analyze a C program as it appears to an LLVM GPU backend at the end of
-// LTO:
-//
-//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
-//       -Rpass=kernel-info -mllvm -kernel-info-end-lto
-//
-// To analyze specified LLVM IR, perhaps previously generated by something like
-// 'clang -save-temps -g -fopenmp --offload-arch=native test.c':
-//
-//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
-//       -pass-remarks=kernel-info -passes=kernel-info
-//
-// kernel-info can also be inserted into a specified LLVM pass pipeline using
-// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline:
-//
-//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
-//       -Rpass=kernel-info -mllvm -kernel-info-end-lto \
-//       -Xoffload-linker --lto-newpm-passes='lto<O2>'
-//
-//   $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
-//       -Rpass=kernel-info \
-//       -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)'
-//
-//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
-//       -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>'
-//
-//   $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
-//       -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)'
+// See llvm/docs/KernelInfo.rst.
 // ===---------------------------------------------------------------------===//
 
 #ifndef LLVM_ANALYSIS_KERNELINFO_H

>From d92856ec609d4bdf7642b8186cf0458dadd80f4a Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 17:41:02 -0400
Subject: [PATCH 03/27] Move conditional outside registration call

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +++++-----
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp   | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3b2ed9fe4236c6..93d1d6b1b80b4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -774,14 +774,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         return nullptr;
       });
 
-  PB.registerFullLinkTimeOptimizationLastEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        if (KernelInfoEndLTO) {
+  if (KernelInfoEndLTO) {
+    PB.registerFullLinkTimeOptimizationLastEPCallback(
+        [](ModulePassManager &PM, OptimizationLevel Level) {
           FunctionPassManager FPM;
           FPM.addPass(KernelInfoPrinter());
           PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-        }
-      });
+        });
+  }
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 8d77c8e53f7a6a..1a4a9781db3338 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -240,14 +240,14 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       });
 
-  PB.registerFullLinkTimeOptimizationLastEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        if (KernelInfoEndLTO) {
+  if (KernelInfoEndLTO) {
+    PB.registerFullLinkTimeOptimizationLastEPCallback(
+        [](ModulePassManager &PM, OptimizationLevel Level) {
           FunctionPassManager FPM;
           FPM.addPass(KernelInfoPrinter());
           PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-        }
-      });
+        });
+  }
 }
 
 TargetTransformInfo

>From 6ac3f419b94e5c5ecd4e7a33b16e1f7e89fa1790 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 18:03:51 -0400
Subject: [PATCH 04/27] Use llvm::SmallString

---
 llvm/lib/Analysis/KernelInfo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 9df3b5b32afcb4..caeada91c31af8 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/KernelInfo.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/DebugInfo.h"
@@ -139,8 +140,8 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
       }
       remarkAlloca(ORE, F, *Alloca, StaticSize);
     } else if (const CallBase *Call = dyn_cast<CallBase>(&I)) {
-      std::string CallKind;
-      std::string RemarkKind;
+      SmallString<40> CallKind;
+      SmallString<40> RemarkKind;
       if (Call->isIndirectCall()) {
         IndirectCalls += Direction;
         CallKind += "indirect";

>From 6367ad7ea65d7ef1da51b4fe8cf6e50af90b1f36 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 19:22:45 -0400
Subject: [PATCH 05/27] Use TTI.getFlatAddressSpace for addrspace(0)

We have to be more careful about targets in the test suite now because
`getFlatAddressSpace` returns garbage for unsupported targets.

Should we change the remarks to say flat addrspace instead of
addrspace(0)?
---
 llvm/include/llvm/Analysis/KernelInfo.h        |  4 +++-
 llvm/lib/Analysis/KernelInfo.cpp               | 18 ++++++++++--------
 .../Inputs/test.ll}                            |  9 ---------
 .../Analysis/KernelInfo/addrspace0/amdgpu.ll   | 12 ++++++++++++
 .../Analysis/KernelInfo/addrspace0/nvptx.ll    | 12 ++++++++++++
 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 10 +++-------
 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll  | 10 +++-------
 7 files changed, 43 insertions(+), 32 deletions(-)
 rename llvm/test/Analysis/KernelInfo/{addrspace0.ll => addrspace0/Inputs/test.ll} (97%)
 create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll
 create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 96cd5f68af6466..c4a18d47723ab1 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -16,6 +16,7 @@
 #define LLVM_ANALYSIS_KERNELINFO_H
 
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 
 namespace llvm {
 class DominatorTree;
@@ -24,7 +25,8 @@ class Function;
 /// Data structure holding function info for kernels.
 class KernelInfo {
   void updateForBB(const BasicBlock &BB, int64_t Direction,
-                   OptimizationRemarkEmitter &ORE);
+                   OptimizationRemarkEmitter &ORE,
+                   const TargetTransformInfo &TTI);
 
 public:
   static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index caeada91c31af8..de08bd49aacfc4 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -122,7 +122,8 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE,
 }
 
 void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
-                             OptimizationRemarkEmitter &ORE) {
+                             OptimizationRemarkEmitter &ORE,
+                             const TargetTransformInfo &TTI) {
   assert(Direction == 1 || Direction == -1);
   const Function &F = *BB.getParent();
   const Module &M = *F.getParent();
@@ -170,34 +171,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
       }
       remarkCall(ORE, F, *Call, CallKind, RemarkKind);
       if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
-        if (MI->getDestAddressSpace() == 0) {
+        if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) {
           AddrspaceZeroAccesses += Direction;
           remarkAddrspaceZeroAccess(ORE, F, I);
         } else if (const AnyMemTransferInst *MT =
                        dyn_cast<AnyMemTransferInst>(MI)) {
-          if (MT->getSourceAddressSpace() == 0) {
+          if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) {
             AddrspaceZeroAccesses += Direction;
             remarkAddrspaceZeroAccess(ORE, F, I);
           }
         }
       }
     } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
-      if (Load->getPointerAddressSpace() == 0) {
+      if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
         AddrspaceZeroAccesses += Direction;
         remarkAddrspaceZeroAccess(ORE, F, I);
       }
     } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
-      if (Store->getPointerAddressSpace() == 0) {
+      if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
         AddrspaceZeroAccesses += Direction;
         remarkAddrspaceZeroAccess(ORE, F, I);
       }
     } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
-      if (At->getPointerAddressSpace() == 0) {
+      if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
         AddrspaceZeroAccesses += Direction;
         remarkAddrspaceZeroAccess(ORE, F, I);
       }
     } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
-      if (At->getPointerAddressSpace() == 0) {
+      if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
         AddrspaceZeroAccesses += Direction;
         remarkAddrspaceZeroAccess(ORE, F, I);
       }
@@ -286,6 +287,7 @@ static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
 
 KernelInfo KernelInfo::getKernelInfo(Function &F,
                                      FunctionAnalysisManager &FAM) {
+  const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
   KernelInfo KI;
   // Only analyze modules for GPUs.
   // TODO: This would be more maintainable if there were an isGPU.
@@ -319,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   for (const auto &BB : F)
     if (DT.isReachableFromEntry(&BB))
-      KI.updateForBB(BB, +1, ORE);
+      KI.updateForBB(BB, +1, ORE, TTI);
 
 #define REMARK_PROPERTY(PROP_NAME)                                             \
   remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
similarity index 97%
rename from llvm/test/Analysis/KernelInfo/addrspace0.ll
rename to llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
index 4c472396443f52..79d3cd2562e909 100644
--- a/llvm/test/Analysis/KernelInfo/addrspace0.ll
+++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
@@ -1,12 +1,3 @@
-; Check info on addrspace(0) memory accesses.
-
-; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
-; RUN:     -disable-output %s 2>&1 | \
-; RUN:   FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s
-
-target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
 define void @f() !dbg !3 {
 entry:
   ; load
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll
new file mode 100644
index 00000000000000..b7a26d6cb47baa
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll
@@ -0,0 +1,12 @@
+; Check info on addrspace(0) memory accesses when the target is amdgpu.
+;
+; The target matters because kernel-info calls
+; TargetTransformInfo::getFlatAddressSpace to select addrspace(0).
+
+; REQUIRES: amdgpu-registered-target
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -mtriple="amdgcn-amd-amdhsa" \
+; RUN:     -disable-output %S/Inputs/test.ll 2>&1 | \
+; RUN:   FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \
+; RUN:       %S/Inputs/test.ll
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll
new file mode 100644
index 00000000000000..43bb985744e0c8
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll
@@ -0,0 +1,12 @@
+; Check info on addrspace(0) memory accesses when the target is nvptx.
+;
+; The target matters because kernel-info calls
+; TargetTransformInfo::getFlatAddressSpace to select addrspace(0).
+
+; REQUIRES: nvptx-registered-target
+
+; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
+; RUN:     -mtriple="nvptx64-nvidia-cuda" \
+; RUN:     -disable-output %S/Inputs/test.ll 2>&1 | \
+; RUN:   FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \
+; RUN:       %S/Inputs/test.ll
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index ee5f65b8e5ab72..d417f8b866f734 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -1,16 +1,12 @@
 ; See ./README.md for how to maintain the LLVM IR in this test.
 
+; REQUIRES: amdgpu-registered-target
+
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -disable-output %s 2>&1 | \
 ; RUN:   FileCheck -match-full-lines %s
 
-; For some builds, we see a warning like:
-;
-;   opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple.
-;
-; But there should be no other remarks here.
-; CHECK-NOT: remark:
-
+;  CHECK-NOT: remark:
 ;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes
diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
index 41d068b03548b0..1222267a8fe576 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
@@ -1,16 +1,12 @@
 ; See ./README.md for how to maintain the LLVM IR in this test.
 
+; REQUIRES: nvptx-registered-target
+
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -disable-output %s 2>&1 | \
 ; RUN:   FileCheck -match-full-lines %s
 
-; For some builds, we see a warning like:
-;
-;   opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple.
-;
-; But there should be no other remarks here.
-; CHECK-NOT: remark:
-
+;  CHECK-NOT: remark:
 ;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes

>From 78446bbb9e1caed303288a2962dd7c78a8779c06 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 19:31:58 -0400
Subject: [PATCH 06/27] Avoid repetition between amdgpu and nvptx tests

---
 .../kernel-info-after-lto/Inputs/test.ll      | 22 ++++++++++
 .../kernel-info-after-lto/amdgpu.ll           | 43 ++++---------------
 .../KernelInfo/kernel-info-after-lto/nvptx.ll | 43 ++++---------------
 3 files changed, 40 insertions(+), 68 deletions(-)
 create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll

diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll
new file mode 100644
index 00000000000000..b85e3c581867c2
--- /dev/null
+++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll
@@ -0,0 +1,22 @@
+; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
+; NONE-NOT: remark:
+define void @test() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+attributes #0 = {
+  "omp_target_num_teams"="100"
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!2 = !DIFile(filename: "test.c", directory: "/tmp")
+!3 = !{}
+!4 = !DISubroutineType(types: !3)
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!6 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
index 7d190ece46e166..6d6e83e8d317f4 100644
--- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll
@@ -4,44 +4,19 @@
 ; REQUIRES: amdgpu-registered-target
 
 ; -kernel-info-end-lto inserts kernel-info into LTO pipeline.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="amdgcn-amd-amdhsa" \
 ; RUN:     -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \
-; RUN:   FileCheck -match-full-lines %s
+; RUN:   FileCheck -match-full-lines %S/Inputs/test.ll
 
 ; Omitting -kernel-info-end-lto disables kernel-info.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="amdgcn-amd-amdhsa" \
 ; RUN:     -passes='lto<O2>' 2>&1 | \
-; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll
 
 ; Omitting LTO disables kernel-info.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="amdgcn-amd-amdhsa" \
 ; RUN:     -passes='default<O2>' -kernel-info-end-lto 2>&1 | \
-; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-amd-amdhsa"
-
-; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
-; NONE-NOT: remark:
-define void @test() #0 !dbg !5 {
-entry:
-  ret void
-}
-
-attributes #0 = {
-  "omp_target_num_teams"="100"
-}
-
-!llvm.module.flags = !{!0}
-!llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!6, !7, !8}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-!2 = !DIFile(filename: "test.c", directory: "/tmp")
-!3 = !{}
-!4 = !DISubroutineType(types: !3)
-!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!6 = !{ptr @test, !"maxclusterrank", i32 200}
-!7 = !{ptr @test, !"maxntidx", i32 210}
-!8 = distinct !{ptr null, !"kernel", i32 1}
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll
diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
index 4e790123c313a5..1e427daed671e8 100644
--- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll
@@ -4,44 +4,19 @@
 ; REQUIRES: nvptx-registered-target
 
 ; -kernel-info-end-lto inserts kernel-info into LTO pipeline.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="nvptx64-nvidia-cuda" \
 ; RUN:     -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \
-; RUN:   FileCheck -match-full-lines %s
+; RUN:   FileCheck -match-full-lines %S/Inputs/test.ll
 
 ; Omitting -kernel-info-end-lto disables kernel-info.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="nvptx64-nvidia-cuda" \
 ; RUN:     -passes='lto<O2>' 2>&1 | \
-; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll
 
 ; Omitting LTO disables kernel-info.
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \
+; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \
+; RUN:     -mtriple="nvptx64-nvidia-cuda" \
 ; RUN:     -passes='default<O2>' -kernel-info-end-lto 2>&1 | \
-; RUN:   FileCheck -allow-empty -check-prefixes=NONE %s
-
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
-; NONE-NOT: remark:
-define void @test() #0 !dbg !5 {
-entry:
-  ret void
-}
-
-attributes #0 = {
-  "omp_target_num_teams"="100"
-}
-
-!llvm.module.flags = !{!0}
-!llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!6, !7, !8}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-!2 = !DIFile(filename: "test.c", directory: "/tmp")
-!3 = !{}
-!4 = !DISubroutineType(types: !3)
-!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!6 = !{ptr @test, !"maxclusterrank", i32 200}
-!7 = !{ptr @test, !"maxntidx", i32 210}
-!8 = distinct !{ptr null, !"kernel", i32 1}
+; RUN:   FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll

>From fede524269915edb51b7d6680a7280a79ca0f710 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 12 Aug 2024 19:39:14 -0400
Subject: [PATCH 07/27] Use named values in tests

---
 .../Analysis/KernelInfo/addrspace0/Inputs/test.ll    |  2 +-
 llvm/test/Analysis/KernelInfo/calls.ll               | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
index 79d3cd2562e909..0821fde8e25b1e 100644
--- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
+++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
@@ -1,6 +1,6 @@
 define void @f() !dbg !3 {
 entry:
-  ; load
+  ; load: check remarks for both unnamed and named values.
   ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0)
   %0 = load i32, ptr null, align 4, !dbg !6
   ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0)
diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll
index 6101a712548981..25b8e3d8803037 100644
--- a/llvm/test/Analysis/KernelInfo/calls.ll
+++ b/llvm/test/Analysis/KernelInfo/calls.ll
@@ -17,9 +17,9 @@ entry:
   call void @g(), !dbg !104
   ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !105
-  %0 = load ptr, ptr null, align 8
+  %fnPtr = load ptr, ptr null, align 8
   ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call
-  call void %0(), !dbg !106
+  call void %fnPtr(), !dbg !106
   ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f'
   invoke void @f() to label %fcont unwind label %cleanup, !dbg !107
 fcont:
@@ -30,7 +30,7 @@ gcont:
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !109
 hcont:
   ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke
-  invoke void %0() to label %end unwind label %cleanup, !dbg !110
+  invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110
 cleanup:
   %ll = landingpad { ptr, i32 }
   cleanup
@@ -53,9 +53,9 @@ entry:
   call void @g(), !dbg !203
   ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !204
-  %0 = load ptr, ptr null, align 8
+  %fnPtr = load ptr, ptr null, align 8
   ; CHECK: remark: test.c:9:3: in function 'g', indirect call
-  call void %0(), !dbg !205
+  call void %fnPtr(), !dbg !205
   ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f'
   invoke void @f() to label %fcont unwind label %cleanup, !dbg !206
 fcont:
@@ -66,7 +66,7 @@ gcont:
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !208
 hcont:
   ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke
-  invoke void %0() to label %end unwind label %cleanup, !dbg !209
+  invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209
 cleanup:
   %ll = landingpad { ptr, i32 }
   cleanup

>From 4c30b8a767c8e5fcaa4c6e8979d5515b9f4656f1 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Tue, 13 Aug 2024 12:03:06 -0400
Subject: [PATCH 08/27] Say flat address space instead of addrspace(0)

---
 llvm/include/llvm/Analysis/KernelInfo.h       |  4 +-
 llvm/lib/Analysis/KernelInfo.cpp              | 32 ++++----
 .../Inputs/test.ll                            | 74 +++++++++----------
 .../{addrspace0 => flat-addrspace}/amdgpu.ll  |  6 +-
 .../{addrspace0 => flat-addrspace}/nvptx.ll   |  6 +-
 .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 12 +--
 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 12 +--
 7 files changed, 73 insertions(+), 73 deletions(-)
 rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/Inputs/test.ll (82%)
 rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/amdgpu.ll (53%)
 rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/nvptx.ll (54%)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index c4a18d47723ab1..66dd95046dd97d 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -89,8 +89,8 @@ class KernelInfo {
   /// Number of calls of type InvokeInst.
   int64_t Invokes = 0;
 
-  /// Number of addrspace(0) memory accesses (via load, store, etc.).
-  int64_t AddrspaceZeroAccesses = 0;
+  /// Number of flat addrspace memory accesses (via load, store, etc.).
+  int64_t FlatAddrspaceAccesses = 0;
 };
 
 /// Analysis class for KernelInfo.
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index de08bd49aacfc4..4eccc8807106b8 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -102,11 +102,11 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
   });
 }
 
-static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE,
+static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
                                       const Function &Caller,
                                       const Instruction &Inst) {
   ORE.emit([&] {
-    OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst);
+    OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst);
     R << "in ";
     identifyFunction(R, Caller);
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) {
@@ -116,7 +116,7 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE,
     }
     if (Inst.hasName())
       R << " ('%" << Inst.getName() << "')";
-    R << " accesses memory in addrspace(0)";
+    R << " accesses memory in flat address space";
     return R;
   });
 }
@@ -172,35 +172,35 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
       remarkCall(ORE, F, *Call, CallKind, RemarkKind);
       if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
         if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) {
-          AddrspaceZeroAccesses += Direction;
-          remarkAddrspaceZeroAccess(ORE, F, I);
+          FlatAddrspaceAccesses += Direction;
+          remarkFlatAddrspaceAccess(ORE, F, I);
         } else if (const AnyMemTransferInst *MT =
                        dyn_cast<AnyMemTransferInst>(MI)) {
           if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) {
-            AddrspaceZeroAccesses += Direction;
-            remarkAddrspaceZeroAccess(ORE, F, I);
+            FlatAddrspaceAccesses += Direction;
+            remarkFlatAddrspaceAccess(ORE, F, I);
           }
         }
       }
     } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
       if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
-        AddrspaceZeroAccesses += Direction;
-        remarkAddrspaceZeroAccess(ORE, F, I);
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
       if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
-        AddrspaceZeroAccesses += Direction;
-        remarkAddrspaceZeroAccess(ORE, F, I);
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
       if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
-        AddrspaceZeroAccesses += Direction;
-        remarkAddrspaceZeroAccess(ORE, F, I);
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
       if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
-        AddrspaceZeroAccesses += Direction;
-        remarkAddrspaceZeroAccess(ORE, F, I);
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
       }
     }
   }
@@ -344,7 +344,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   REMARK_PROPERTY(IndirectCalls);
   REMARK_PROPERTY(DirectCallsToDefinedFunctions);
   REMARK_PROPERTY(Invokes);
-  REMARK_PROPERTY(AddrspaceZeroAccesses);
+  REMARK_PROPERTY(FlatAddrspaceAccesses);
 #undef REMARK_PROPERTY
 
   return KI;
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
similarity index 82%
rename from llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
rename to llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
index 0821fde8e25b1e..07c884792f45cd 100644
--- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll
+++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
@@ -1,129 +1,129 @@
 define void @f() !dbg !3 {
 entry:
   ; load: check remarks for both unnamed and named values.
-  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space
   %0 = load i32, ptr null, align 4, !dbg !6
-  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space
   %load = load i32, ptr null, align 4, !dbg !6
-  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space
   %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6
   %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6
   %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6
 
   ; store
-  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space
   store i32 0, ptr null, align 4, !dbg !7
-  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space
   store i32 0, ptr addrspace(0) null, align 4, !dbg !7
   store i32 0, ptr addrspace(1) null, align 4, !dbg !7
   store i32 0, ptr addrspace(8) null, align 4, !dbg !7
 
   ; atomicrmw
-  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space
   atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8
-  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space
   atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8
   atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8
   atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8
 
   ; cmpxchg
-  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space
   cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9
-  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space
   cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
   cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
   cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
 
   ; llvm.memcpy
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
   call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
   call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
   call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
   call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
 
   ; llvm.memcpy.inline
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10
   call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10
 
   ; llvm.memcpy.element.unordered.atomic
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10
-  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10
   call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10
 
   ; llvm.memmove
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
   call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
   call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11
   call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11
   call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11
 
   ; llvm.memmove.element.unordered.atomic
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space
   call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11
-  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space
   call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11
   call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11
 
   ; llvm.memset
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12
   call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12
   call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12
 
   ; llvm.memset.inline
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12
   call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12
   call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12
 
   ; llvm.memset.element.unordered.atomic
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12
-  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0)
+  ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space
   call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12
   call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12
   call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12
 
   ret void
 }
-; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36
+; CHECK: remark: test.c:2:0: in function 'f', FlatAddrspaceAccesses = 36
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!2}
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll
similarity index 53%
rename from llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll
rename to llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll
index b7a26d6cb47baa..7447dcf51cc895 100644
--- a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll
@@ -1,12 +1,12 @@
-; Check info on addrspace(0) memory accesses when the target is amdgpu.
+; Check info on flat address space memory accesses when the target is amdgpu.
 ;
 ; The target matters because kernel-info calls
-; TargetTransformInfo::getFlatAddressSpace to select addrspace(0).
+; TargetTransformInfo::getFlatAddressSpace to select the flat address space.
 
 ; REQUIRES: amdgpu-registered-target
 
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -mtriple="amdgcn-amd-amdhsa" \
 ; RUN:     -disable-output %S/Inputs/test.ll 2>&1 | \
-; RUN:   FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \
+; RUN:   FileCheck -match-full-lines -implicit-check-not='flat address space' \
 ; RUN:       %S/Inputs/test.ll
diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll
similarity index 54%
rename from llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll
rename to llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll
index 43bb985744e0c8..02321c19e022dd 100644
--- a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll
@@ -1,12 +1,12 @@
-; Check info on addrspace(0) memory accesses when the target is nvptx.
+; Check info on flat address space memory accesses when the target is nvptx.
 ;
 ; The target matters because kernel-info calls
-; TargetTransformInfo::getFlatAddressSpace to select addrspace(0).
+; TargetTransformInfo::getFlatAddressSpace to select the flat address space.
 
 ; REQUIRES: nvptx-registered-target
 
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -mtriple="nvptx64-nvidia-cuda" \
 ; RUN:     -disable-output %S/Inputs/test.ll 2>&1 | \
-; RUN:   FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \
+; RUN:   FileCheck -match-full-lines -implicit-check-not='flat address space' \
 ; RUN:       %S/Inputs/test.ll
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index d417f8b866f734..56ee35810ef26a 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -10,7 +10,7 @@
 ;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes
-; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init'
 ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f'
 ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g'
@@ -26,11 +26,11 @@
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
-; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1
 
 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
-; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0)
-; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
@@ -40,7 +40,7 @@
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
-; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2
 
 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes
@@ -54,7 +54,7 @@
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
-; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0
 ;  CHECK-NOT: {{.}}
 
 
diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
index 1222267a8fe576..ee76ecdf5d795d 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
@@ -10,7 +10,7 @@
 ;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes
-; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init'
 ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f'
 ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g'
@@ -25,11 +25,11 @@
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
-; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1
 
 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
-; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0)
-; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0)
+; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
-; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2
 
 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes
 ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes
@@ -53,7 +53,7 @@
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
-; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0
 ;  CHECK-NOT: remark: {{.*: in function 'g',.*}}
 
 ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't

>From 33f0d4dd276eda64f495cdf66411bc77d20517c6 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Tue, 13 Aug 2024 12:23:42 -0400
Subject: [PATCH 09/27] Cache the flat address space

---
 llvm/include/llvm/Analysis/KernelInfo.h |  8 +++++---
 llvm/lib/Analysis/KernelInfo.cpp        | 20 ++++++++++----------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 66dd95046dd97d..3cf5bec58cf551 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -25,8 +25,7 @@ class Function;
 /// Data structure holding function info for kernels.
 class KernelInfo {
   void updateForBB(const BasicBlock &BB, int64_t Direction,
-                   OptimizationRemarkEmitter &ORE,
-                   const TargetTransformInfo &TTI);
+                   OptimizationRemarkEmitter &ORE);
 
 public:
   static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
@@ -89,7 +88,10 @@ class KernelInfo {
   /// Number of calls of type InvokeInst.
   int64_t Invokes = 0;
 
-  /// Number of flat addrspace memory accesses (via load, store, etc.).
+  /// Target-specific flat address space.
+  unsigned FlatAddrspace;
+
+  /// Number of flat address space memory accesses (via load, store, etc.).
   int64_t FlatAddrspaceAccesses = 0;
 };
 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 4eccc8807106b8..b5b9145641550f 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -122,8 +122,7 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
 }
 
 void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
-                             OptimizationRemarkEmitter &ORE,
-                             const TargetTransformInfo &TTI) {
+                             OptimizationRemarkEmitter &ORE) {
   assert(Direction == 1 || Direction == -1);
   const Function &F = *BB.getParent();
   const Module &M = *F.getParent();
@@ -171,34 +170,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
       }
       remarkCall(ORE, F, *Call, CallKind, RemarkKind);
       if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
-        if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) {
+        if (MI->getDestAddressSpace() == FlatAddrspace) {
           FlatAddrspaceAccesses += Direction;
           remarkFlatAddrspaceAccess(ORE, F, I);
         } else if (const AnyMemTransferInst *MT =
                        dyn_cast<AnyMemTransferInst>(MI)) {
-          if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) {
+          if (MT->getSourceAddressSpace() == FlatAddrspace) {
             FlatAddrspaceAccesses += Direction;
             remarkFlatAddrspaceAccess(ORE, F, I);
           }
         }
       }
     } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
-      if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
+      if (Load->getPointerAddressSpace() == FlatAddrspace) {
         FlatAddrspaceAccesses += Direction;
         remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
-      if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
+      if (Store->getPointerAddressSpace() == FlatAddrspace) {
         FlatAddrspaceAccesses += Direction;
         remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
-      if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
+      if (At->getPointerAddressSpace() == FlatAddrspace) {
         FlatAddrspaceAccesses += Direction;
         remarkFlatAddrspaceAccess(ORE, F, I);
       }
     } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
-      if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) {
+      if (At->getPointerAddressSpace() == FlatAddrspace) {
         FlatAddrspaceAccesses += Direction;
         remarkFlatAddrspaceAccess(ORE, F, I);
       }
@@ -287,7 +286,6 @@ static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
 
 KernelInfo KernelInfo::getKernelInfo(Function &F,
                                      FunctionAnalysisManager &FAM) {
-  const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
   KernelInfo KI;
   // Only analyze modules for GPUs.
   // TODO: This would be more maintainable if there were an isGPU.
@@ -297,6 +295,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
     return KI;
   KI.IsValid = true;
 
+  KI.FlatAddrspace = FAM.getResult<TargetIRAnalysis>(F).getFlatAddressSpace();
+
   // Record function properties.
   KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
   KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams");
@@ -321,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   for (const auto &BB : F)
     if (DT.isReachableFromEntry(&BB))
-      KI.updateForBB(BB, +1, ORE, TTI);
+      KI.updateForBB(BB, +1, ORE);
 
 #define REMARK_PROPERTY(PROP_NAME)                                             \
   remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)

>From a2a512c5bfbea1bbe14f4db2574631b0703106ea Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Tue, 13 Aug 2024 13:18:20 -0400
Subject: [PATCH 10/27] Link KernelInfo.rst from Passes.rst

---
 llvm/docs/Passes.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst
index 49f633e98d16fe..939aeabd599b9c 100644
--- a/llvm/docs/Passes.rst
+++ b/llvm/docs/Passes.rst
@@ -5,6 +5,11 @@ LLVM's Analysis and Transform Passes
 .. contents::
     :local:
 
+.. toctree::
+   :hidden:
+
+   KernelInfo
+
 Introduction
 ============
 .. warning:: This document is not updated frequently, and the list of passes
@@ -148,6 +153,12 @@ This pass collects the count of all instructions and reports them.
 Bookkeeping for "interesting" users of expressions computed from induction
 variables.
 
+``kernel-info``: GPU Kernel Info
+--------------------------------
+
+Reports various statistics for codes compiled for GPUs.  This pass is
+:doc:`documented separately<KernelInfo>`.
+
 ``lazy-value-info``: Lazy Value Information Analysis
 ----------------------------------------------------
 

>From de04ac4fee83f24bad8510f055cc7b303cf76939 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Tue, 13 Aug 2024 13:48:32 -0400
Subject: [PATCH 11/27] Don't filter out cpus

`-kernel-info-end-lto` doesn't insert kernel-info for CPU modules.  If
the user explicitly specifies the pass for a CPU module, then it will
now run.
---
 llvm/include/llvm/Analysis/KernelInfo.h | 4 ----
 llvm/lib/Analysis/KernelInfo.cpp        | 8 --------
 2 files changed, 12 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 3cf5bec58cf551..951c58cfc02180 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -36,10 +36,6 @@ class KernelInfo {
 
   bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); }
 
-  /// If false, nothing was recorded here because the supplied function didn't
-  /// appear in a module compiled for a GPU.
-  bool IsValid = false;
-
   /// Whether the function has external linkage and is not a kernel function.
   bool ExternalNotKernel = false;
 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index b5b9145641550f..b29c3c3fecd164 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -287,14 +287,6 @@ static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
 KernelInfo KernelInfo::getKernelInfo(Function &F,
                                      FunctionAnalysisManager &FAM) {
   KernelInfo KI;
-  // Only analyze modules for GPUs.
-  // TODO: This would be more maintainable if there were an isGPU.
-  const std::string &TT = F.getParent()->getTargetTriple();
-  llvm::Triple T(TT);
-  if (!T.isAMDGPU() && !T.isNVPTX())
-    return KI;
-  KI.IsValid = true;
-
   KI.FlatAddrspace = FAM.getResult<TargetIRAnalysis>(F).getFlatAddressSpace();
 
   // Record function properties.

>From ec5d2bd00ed0c9305a0820d56f69f1be25ebdd6b Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 12:19:16 -0400
Subject: [PATCH 12/27] Include less in header

---
 llvm/include/llvm/Analysis/KernelInfo.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 951c58cfc02180..c3bc0849efa0ff 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -15,12 +15,11 @@
 #ifndef LLVM_ANALYSIS_KERNELINFO_H
 #define LLVM_ANALYSIS_KERNELINFO_H
 
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/PassManager.h"
 
 namespace llvm {
-class DominatorTree;
-class Function;
+class BasicBlock;
+class OptimizationRemarkEmitter;
 
 /// Data structure holding function info for kernels.
 class KernelInfo {

>From c06b9052e6f18e2f290f54eb1ca2583aa3bbeee0 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 12:19:42 -0400
Subject: [PATCH 13/27] Removed unused comparison operators

They compared whole objects with std::memcmp, so they wouldn't have
worked reliably anyway given uninitialized padding in the struct.
---
 llvm/include/llvm/Analysis/KernelInfo.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index c3bc0849efa0ff..6d4edfb3525cc4 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -29,12 +29,6 @@ class KernelInfo {
 public:
   static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
 
-  bool operator==(const KernelInfo &FPI) const {
-    return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0;
-  }
-
-  bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); }
-
   /// Whether the function has external linkage and is not a kernel function.
   bool ExternalNotKernel = false;
 

>From d83d22a1079eb66487b084905af114ec384a8319 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 12:19:52 -0400
Subject: [PATCH 14/27] Remove redundant null check

---
 llvm/lib/Analysis/KernelInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index b29c3c3fecd164..c039d495ee6edf 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -161,7 +161,7 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
       }
       if (!Call->isIndirectCall()) {
         if (const Function *Callee = Call->getCalledFunction()) {
-          if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) {
+          if (!Callee->isIntrinsic() && !Callee->isDeclaration()) {
             DirectCallsToDefinedFunctions += Direction;
             CallKind += " to defined function";
             RemarkKind += "ToDefinedFunction";

>From 1649cf8d3af43fd4bdcb5bf6335fffb52f9d92af Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 15:41:36 -0400
Subject: [PATCH 15/27] Move KernelInfo to KernelInfo.cpp, remove
 KernelInfoAnalysis

For now, analysis results will not be used beyond emitting remarks.
If that changes, we can revert.
---
 llvm/include/llvm/Analysis/KernelInfo.h       | 90 +------------------
 llvm/lib/Analysis/KernelInfo.cpp              | 73 ++++++++++++++-
 llvm/lib/Passes/PassRegistry.def              |  1 -
 .../test/Analysis/KernelInfo/openmp/README.md |  4 +-
 4 files changed, 75 insertions(+), 93 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 6d4edfb3525cc4..c5c33fac346554 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
-// classes used to extract function properties from a GPU kernel.
+// This file defines the KernelInfoPrinter class used to emit remarks about
+// function properties from a GPU kernel.
 //
 // See llvm/docs/KernelInfo.rst.
 // ===---------------------------------------------------------------------===//
@@ -18,95 +18,11 @@
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
-class BasicBlock;
-class OptimizationRemarkEmitter;
-
-/// Data structure holding function info for kernels.
-class KernelInfo {
-  void updateForBB(const BasicBlock &BB, int64_t Direction,
-                   OptimizationRemarkEmitter &ORE);
-
-public:
-  static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
-
-  /// Whether the function has external linkage and is not a kernel function.
-  bool ExternalNotKernel = false;
-
-  /// OpenMP Launch bounds.
-  ///@{
-  std::optional<int64_t> OmpTargetNumTeams;
-  std::optional<int64_t> OmpTargetThreadLimit;
-  ///@}
-
-  /// AMDGPU launch bounds.
-  ///@{
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
-  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
-  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
-  std::optional<int64_t> AmdgpuWavesPerEuMin;
-  std::optional<int64_t> AmdgpuWavesPerEuMax;
-  ///@}
-
-  /// NVPTX launch bounds.
-  ///@{
-  std::optional<int64_t> Maxclusterrank;
-  std::optional<int64_t> Maxntidx;
-  ///@}
-
-  /// The number of alloca instructions inside the function, the number of those
-  /// with allocation sizes that cannot be determined at compile time, and the
-  /// sum of the sizes that can be.
-  ///
-  /// With the current implementation for at least some GPU archs,
-  /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
-  /// case the implementation changes.
-  int64_t Allocas = 0;
-  int64_t AllocasDyn = 0;
-  int64_t AllocasStaticSizeSum = 0;
-
-  /// Number of direct/indirect calls (anything derived from CallBase).
-  int64_t DirectCalls = 0;
-  int64_t IndirectCalls = 0;
-
-  /// Number of direct calls made from this function to other functions
-  /// defined in this module.
-  int64_t DirectCallsToDefinedFunctions = 0;
-
-  /// Number of calls of type InvokeInst.
-  int64_t Invokes = 0;
-
-  /// Target-specific flat address space.
-  unsigned FlatAddrspace;
-
-  /// Number of flat address space memory accesses (via load, store, etc.).
-  int64_t FlatAddrspaceAccesses = 0;
-};
-
-/// Analysis class for KernelInfo.
-class KernelInfoAnalysis : public AnalysisInfoMixin<KernelInfoAnalysis> {
-public:
-  static AnalysisKey Key;
-
-  using Result = const KernelInfo;
-
-  KernelInfo run(Function &F, FunctionAnalysisManager &FAM) {
-    return KernelInfo::getKernelInfo(F, FAM);
-  }
-};
-
-/// Printer pass for KernelInfoAnalysis.
-///
-/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled.
 class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
 public:
   explicit KernelInfoPrinter() {}
 
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
-    AM.getResult<KernelInfoAnalysis>(F);
-    return PreservedAnalyses::all();
-  }
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
   static bool isRequired() { return true; }
 };
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index c039d495ee6edf..a628f370c802ef 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
-// classes used to extract function properties from a kernel.
+// This file defines the KernelInfoPrinter class used to emit remarks about
+// function properties from a GPU kernel.
 //
 //===----------------------------------------------------------------------===//
 
@@ -27,6 +27,69 @@ using namespace llvm;
 
 #define DEBUG_TYPE "kernel-info"
 
+/// Data structure holding function info for kernels.
+class KernelInfo {
+  void updateForBB(const BasicBlock &BB, int64_t Direction,
+                   OptimizationRemarkEmitter &ORE);
+
+public:
+  static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+
+  /// Whether the function has external linkage and is not a kernel function.
+  bool ExternalNotKernel = false;
+
+  /// OpenMP Launch bounds.
+  ///@{
+  std::optional<int64_t> OmpTargetNumTeams;
+  std::optional<int64_t> OmpTargetThreadLimit;
+  ///@}
+
+  /// AMDGPU launch bounds.
+  ///@{
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
+  std::optional<int64_t> AmdgpuWavesPerEuMin;
+  std::optional<int64_t> AmdgpuWavesPerEuMax;
+  ///@}
+
+  /// NVPTX launch bounds.
+  ///@{
+  std::optional<int64_t> Maxclusterrank;
+  std::optional<int64_t> Maxntidx;
+  ///@}
+
+  /// The number of alloca instructions inside the function, the number of those
+  /// with allocation sizes that cannot be determined at compile time, and the
+  /// sum of the sizes that can be.
+  ///
+  /// With the current implementation for at least some GPU archs,
+  /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
+  /// case the implementation changes.
+  int64_t Allocas = 0;
+  int64_t AllocasDyn = 0;
+  int64_t AllocasStaticSizeSum = 0;
+
+  /// Number of direct/indirect calls (anything derived from CallBase).
+  int64_t DirectCalls = 0;
+  int64_t IndirectCalls = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of calls of type InvokeInst.
+  int64_t Invokes = 0;
+
+  /// Target-specific flat address space.
+  unsigned FlatAddrspace;
+
+  /// Number of flat address space memory accesses (via load, store, etc.).
+  int64_t FlatAddrspaceAccesses = 0;
+};
+
 static bool isKernelFunction(Function &F) {
   // TODO: Is this general enough?  Consider languages beyond OpenMP.
   return F.hasFnAttribute("kernel");
@@ -342,4 +405,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   return KI;
 }
 
-AnalysisKey KernelInfoAnalysis::Key;
+PreservedAnalyses KernelInfoPrinter::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+  KernelInfo::getKernelInfo(F, AM);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index dcfa732f410b38..391cca0da2ea1c 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -278,7 +278,6 @@ FUNCTION_ANALYSIS(
     MachineFunctionAnalysis(static_cast<const LLVMTargetMachine *>(TM)))
 FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis())
 FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis())
-FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis())
 FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
 FUNCTION_ANALYSIS("loops", LoopAnalysis())
 FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md
index 0d13950e198edd..5471b2e1b220db 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/README.md
+++ b/llvm/test/Analysis/KernelInfo/openmp/README.md
@@ -1,9 +1,9 @@
-The tests in this directory check that basic KernelInfoAnalysis functionality
+The tests in this directory check that basic KernelInfoPrinter functionality
 behaves reasonably for LLVM IR produced by Clang OpenMP codegen.
 
 So that these tests are straightforward to maintain and faithfully represent
 Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them.  Other tests
-more exhaustively check KernelInfoAnalysis features using reduced LLVM IR.
+more exhaustively check KernelInfoPrinter features using reduced LLVM IR.
 
 The LLVM IR in each test file `$TEST` can be regenerated as follows in the case
 that Clang OpenMP codegen changes or it becomes desirable to adjust the source

>From 1a3c0aef034087e235fda909c69cc9e75b0bb874 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 15:42:20 -0400
Subject: [PATCH 16/27] Use printAsOperand not getName to identify instruction

---
 llvm/lib/Analysis/KernelInfo.cpp                       |  8 ++++++--
 .../Analysis/KernelInfo/flat-addrspace/Inputs/test.ll  | 10 +++++-----
 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll         |  2 +-
 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll          |  2 +-
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index a628f370c802ef..41acde725b4710 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -177,8 +177,12 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
     } else {
       R << ", '" << Inst.getOpcodeName() << "' instruction";
     }
-    if (Inst.hasName())
-      R << " ('%" << Inst.getName() << "')";
+    if (!Inst.getType()->isVoidTy()) {
+      std::string Name;
+      raw_string_ostream OS(Name);
+      Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
+      R << " ('" << Name << "')";
+    }
     R << " accesses memory in flat address space";
     return R;
   });
diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
index 07c884792f45cd..b54c3a18f3e70e 100644
--- a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
+++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll
@@ -1,7 +1,7 @@
 define void @f() !dbg !3 {
 entry:
   ; load: check remarks for both unnamed and named values.
-  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space
+  ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space
   %0 = load i32, ptr null, align 4, !dbg !6
   ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space
   %load = load i32, ptr null, align 4, !dbg !6
@@ -19,17 +19,17 @@ entry:
   store i32 0, ptr addrspace(8) null, align 4, !dbg !7
 
   ; atomicrmw
-  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space
   atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8
-  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space
+  ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space
   atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8
   atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8
   atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8
 
   ; cmpxchg
-  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space
   cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9
-  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space
+  ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space
   cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
   cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
   cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index 56ee35810ef26a..82f6f243264bc4 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -30,7 +30,7 @@
 
 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space
-; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
index ee76ecdf5d795d..eb2cba596be223 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
@@ -29,7 +29,7 @@
 
 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes
 ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space
-; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space
+; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1

>From ea89a81b0ebf30fa331f3bcd0dbfced21478846d Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 15:42:28 -0400
Subject: [PATCH 17/27] Use printAsOperand to report indirect callee

---
 llvm/lib/Analysis/KernelInfo.cpp       | 21 +++++++++++----------
 llvm/test/Analysis/KernelInfo/calls.ll |  8 ++++----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 41acde725b4710..9768fe90b14330 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -147,20 +147,21 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
     OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call);
     R << "in ";
     identifyFunction(R, Caller);
-    R << ", " << CallKind;
-    if (const Function *Callee =
-            dyn_cast_or_null<Function>(Call.getCalledOperand())) {
-      R << ", callee is";
-      StringRef Name = Callee->getName();
-      if (auto *SubProgram = Callee->getSubprogram()) {
+    R << ", " << CallKind << ", callee is";
+    Value *Callee = Call.getCalledOperand();
+    std::string Name;
+    if (const Function *FnCallee = dyn_cast<Function>(Callee)) {
+      if (auto *SubProgram = FnCallee->getSubprogram()) {
         if (SubProgram->isArtificial())
           R << " artificial";
       }
-      if (!Name.empty())
-        R << " '" << Name << "'";
-      else
-        R << " with unknown name";
+      Name = FnCallee->getName();
     }
+    if (Name.empty()) {
+      raw_string_ostream OS(Name);
+      Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
+    }
+    R << " '" << Name << "'";
     return R;
   });
 }
diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll
index 25b8e3d8803037..d00ab2b74d3985 100644
--- a/llvm/test/Analysis/KernelInfo/calls.ll
+++ b/llvm/test/Analysis/KernelInfo/calls.ll
@@ -18,7 +18,7 @@ entry:
   ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !105
   %fnPtr = load ptr, ptr null, align 8
-  ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call
+  ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr'
   call void %fnPtr(), !dbg !106
   ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f'
   invoke void @f() to label %fcont unwind label %cleanup, !dbg !107
@@ -29,7 +29,7 @@ gcont:
   ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h'
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !109
 hcont:
-  ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke
+  ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr'
   invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110
 cleanup:
   %ll = landingpad { ptr, i32 }
@@ -54,7 +54,7 @@ entry:
   ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !204
   %fnPtr = load ptr, ptr null, align 8
-  ; CHECK: remark: test.c:9:3: in function 'g', indirect call
+  ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr'
   call void %fnPtr(), !dbg !205
   ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f'
   invoke void @f() to label %fcont unwind label %cleanup, !dbg !206
@@ -65,7 +65,7 @@ gcont:
   ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h'
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !208
 hcont:
-  ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke
+  ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr'
   invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209
 cleanup:
   %ll = landingpad { ptr, i32 }

>From 8da602b92369af0d9a4f794b1956bd15ecac0263 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 16:36:36 -0400
Subject: [PATCH 18/27] Report inline assembly calls

---
 llvm/lib/Analysis/KernelInfo.cpp              |  8 +++++++
 llvm/test/Analysis/KernelInfo/calls.ll        | 24 +++++++++++++++----
 .../test/Analysis/KernelInfo/openmp/amdgpu.ll |  3 +++
 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll |  3 +++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 9768fe90b14330..034194e27f9fb4 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -80,6 +80,9 @@ class KernelInfo {
   /// defined in this module.
   int64_t DirectCallsToDefinedFunctions = 0;
 
+  /// Number of direct calls to inline assembly.
+  int64_t InlineAssemblyCalls = 0;
+
   /// Number of calls of type InvokeInst.
   int64_t Invokes = 0;
 
@@ -234,6 +237,10 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
             CallKind += " to defined function";
             RemarkKind += "ToDefinedFunction";
           }
+        } else if (Call->isInlineAsm()) {
+          InlineAssemblyCalls += Direction;
+          CallKind += " to inline assembly";
+          RemarkKind += "ToInlineAssembly";
         }
       }
       remarkCall(ORE, F, *Call, CallKind, RemarkKind);
@@ -403,6 +410,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   REMARK_PROPERTY(DirectCalls);
   REMARK_PROPERTY(IndirectCalls);
   REMARK_PROPERTY(DirectCallsToDefinedFunctions);
+  REMARK_PROPERTY(InlineAssemblyCalls);
   REMARK_PROPERTY(Invokes);
   REMARK_PROPERTY(FlatAddrspaceAccesses);
 #undef REMARK_PROPERTY
diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll
index d00ab2b74d3985..2a2672c70b85cb 100644
--- a/llvm/test/Analysis/KernelInfo/calls.ll
+++ b/llvm/test/Analysis/KernelInfo/calls.ll
@@ -17,6 +17,8 @@ entry:
   call void @g(), !dbg !104
   ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !105
+  ; CHECK: remark: test.c:24:5: in artificial function 'h', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""'
+  call void asm sideeffect "eieio", ""(), !dbg !111
   %fnPtr = load ptr, ptr null, align 8
   ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr'
   call void %fnPtr(), !dbg !106
@@ -29,6 +31,9 @@ gcont:
   ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h'
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !109
 hcont:
+  ; CHECK: remark: test.c:25:5: in artificial function 'h', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""'
+  invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112
+asmcont:
   ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr'
   invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110
 cleanup:
@@ -38,10 +43,11 @@ cleanup:
 end:
   ret void
 }
-; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6
+; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 8
 ; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2
 ; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4
-; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4
+; CHECK: remark: test.c:13:0: in artificial function 'h', InlineAssemblyCalls = 2
+; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 5
 
 declare void @f()
 
@@ -53,6 +59,8 @@ entry:
   call void @g(), !dbg !203
   ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h'
   call void @h(), !dbg !204
+  ; CHECK: remark: test.c:14:3: in function 'g', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""'
+  call void asm sideeffect "eieio", ""(), !dbg !210
   %fnPtr = load ptr, ptr null, align 8
   ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr'
   call void %fnPtr(), !dbg !205
@@ -65,6 +73,9 @@ gcont:
   ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h'
   invoke void @h() to label %hcont unwind label %cleanup, !dbg !208
 hcont:
+  ; CHECK: remark: test.c:15:3: in function 'g', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""'
+  invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211
+asmcont:
   ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr'
   invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209
 cleanup:
@@ -74,10 +85,11 @@ cleanup:
 end:
   ret void
 }
-; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6
+; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 8
 ; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2
 ; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4
-; CHECK: remark: test.c:3:0: in function 'g', Invokes = 4
+; CHECK: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 2
+; CHECK: remark: test.c:3:0: in function 'g', Invokes = 5
 
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
@@ -99,6 +111,8 @@ end:
 !108 = !DILocation(line: 21, column: 5, scope: !103)
 !109 = !DILocation(line: 22, column: 5, scope: !103)
 !110 = !DILocation(line: 23, column: 5, scope: !103)
+!111 = !DILocation(line: 24, column: 5, scope: !103)
+!112 = !DILocation(line: 25, column: 5, scope: !103)
 
 !200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4)
 !201 = !DISubroutineType(types: !3)
@@ -110,3 +124,5 @@ end:
 !207 = !DILocation(line: 11, column: 3, scope: !200)
 !208 = !DILocation(line: 12, column: 3, scope: !200)
 !209 = !DILocation(line: 13, column: 3, scope: !200)
+!210 = !DILocation(line: 14, column: 3, scope: !200)
+!211 = !DILocation(line: 15, column: 3, scope: !200)
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index 82f6f243264bc4..be3b357cc45309 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -25,6 +25,7 @@
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1
 
@@ -39,6 +40,7 @@
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2
 
@@ -53,6 +55,7 @@
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0
 ;  CHECK-NOT: {{.}}
diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
index eb2cba596be223..2dbd04b2536c45 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
@@ -24,6 +24,7 @@
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1
 
@@ -38,6 +39,7 @@
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2
 
@@ -52,6 +54,7 @@
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0
 ;  CHECK-NOT: remark: {{.*: in function 'g',.*}}

>From 45114fd9d85d614f2f3bc18543fb6779cab1053d Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 16:41:34 -0400
Subject: [PATCH 19/27] Use llvm::SmallString

---
 llvm/lib/Analysis/KernelInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 034194e27f9fb4..96a77c96dc1f33 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -152,7 +152,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
     identifyFunction(R, Caller);
     R << ", " << CallKind << ", callee is";
     Value *Callee = Call.getCalledOperand();
-    std::string Name;
+    SmallString<100> Name; // might be function name or asm expression
     if (const Function *FnCallee = dyn_cast<Function>(Callee)) {
       if (auto *SubProgram = FnCallee->getSubprogram()) {
         if (SubProgram->isArtificial())
@@ -161,7 +161,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
       Name = FnCallee->getName();
     }
     if (Name.empty()) {
-      raw_string_ostream OS(Name);
+      raw_svector_ostream OS(Name);
       Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
     }
     R << " '" << Name << "'";

>From eea139c63cde6f900962c5e999ffce79568b4391 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 16:47:20 -0400
Subject: [PATCH 20/27] Use llvm::SmallString

---
 llvm/lib/Analysis/KernelInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 96a77c96dc1f33..ff713235162385 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -182,8 +182,8 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
       R << ", '" << Inst.getOpcodeName() << "' instruction";
     }
     if (!Inst.getType()->isVoidTy()) {
-      std::string Name;
-      raw_string_ostream OS(Name);
+      SmallString<20> Name;
+      raw_svector_ostream OS(Name);
       Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
       R << " ('" << Name << "')";
     }

>From 8bf6e4e4bb262e0866d3e2098bb1a16c7293e2be Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 16 Aug 2024 17:17:07 -0400
Subject: [PATCH 21/27] getKernelInfo -> emitKernelInfo because return is
 unused

---
 llvm/lib/Analysis/KernelInfo.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index ff713235162385..282dc092bfd624 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -33,7 +33,7 @@ class KernelInfo {
                    OptimizationRemarkEmitter &ORE);
 
 public:
-  static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+  static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM);
 
   /// Whether the function has external linkage and is not a kernel function.
   bool ExternalNotKernel = false;
@@ -359,8 +359,7 @@ static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
   return Result;
 }
 
-KernelInfo KernelInfo::getKernelInfo(Function &F,
-                                     FunctionAnalysisManager &FAM) {
+void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) {
   KernelInfo KI;
   KI.FlatAddrspace = FAM.getResult<TargetIRAnalysis>(F).getFlatAddressSpace();
 
@@ -415,11 +414,11 @@ KernelInfo KernelInfo::getKernelInfo(Function &F,
   REMARK_PROPERTY(FlatAddrspaceAccesses);
 #undef REMARK_PROPERTY
 
-  return KI;
+  return;
 }
 
 PreservedAnalyses KernelInfoPrinter::run(Function &F,
                                          FunctionAnalysisManager &AM) {
-  KernelInfo::getKernelInfo(F, AM);
+  KernelInfo::emitKernelInfo(F, AM);
   return PreservedAnalyses::all();
 }

>From 62d494d9a9f13e5b58d71f083a6cb9f67f19579b Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Fri, 13 Sep 2024 17:30:23 -0400
Subject: [PATCH 22/27] Clean up launch bounds

* For amdgpu, use AMDGPUSubtarget functions to query values.  Thus, we
  end up with logical values that don't appear explicitly in the IR,
  and we ignore some impossible values that do appear explicitly.
* For nvptx, use NVPTXUtilities.h functions to query values.  Thus,
  drop KernelInfo.cpp's implementation of NVVM annotation parsing.
  Also, add support for a few more launch bounds.
* Move target-specific collection of launch bounds to target-specific
  classes (GCNSubtarget and NVPTXSubtarget).  While making the above
  changes, I struggled to find the right headers to enable keeping the
  implementation in KernelInfo.cpp, and one reviewer wanted to see
  some reorganization along these lines anyway.
---
 llvm/include/llvm/Analysis/KernelInfo.h       |   8 +-
 .../llvm/CodeGen/TargetSubtargetInfo.h        |   5 +
 llvm/lib/Analysis/KernelInfo.cpp              | 127 ++++--------------
 llvm/lib/Passes/PassRegistry.def              |   2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   4 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |  16 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   4 +
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp      |  16 +++
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |   4 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |   4 +-
 .../KernelInfo/launch-bounds/amdgpu.ll        |  67 +++++++--
 .../KernelInfo/launch-bounds/nvptx.ll         |  10 +-
 .../test/Analysis/KernelInfo/openmp/amdgpu.ll |  19 +++
 13 files changed, 166 insertions(+), 120 deletions(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index c5c33fac346554..6633c28858a2f3 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -18,9 +18,15 @@
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
+
+class TargetMachine;
+
 class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+private:
+  TargetMachine *TM;
+
 public:
-  explicit KernelInfoPrinter() {}
+  explicit KernelInfoPrinter(TargetMachine *TM) : TM(TM) {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index b4b018f080914a..5d75510e915135 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -338,6 +338,11 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   /// the pass, with architecture specific overrides providing the information
   /// where they are implemented.
   virtual bool supportsInitUndef() const { return false; }
+
+  /// For \p F, call \p Body with the name and value of each launch bound.
+  virtual void forEachLaunchBound(
+      const Function &F,
+      std::function<void(StringRef Name, unsigned Value)> Body) const {}
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 282dc092bfd624..6d0efdfec83444 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
@@ -22,6 +23,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
@@ -33,7 +35,8 @@ class KernelInfo {
                    OptimizationRemarkEmitter &ORE);
 
 public:
-  static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+  static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
+                             TargetMachine *TM);
 
   /// Whether the function has external linkage and is not a kernel function.
   bool ExternalNotKernel = false;
@@ -44,23 +47,6 @@ class KernelInfo {
   std::optional<int64_t> OmpTargetThreadLimit;
   ///@}
 
-  /// AMDGPU launch bounds.
-  ///@{
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
-  std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
-  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
-  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
-  std::optional<int64_t> AmdgpuWavesPerEuMin;
-  std::optional<int64_t> AmdgpuWavesPerEuMax;
-  ///@}
-
-  /// NVPTX launch bounds.
-  ///@{
-  std::optional<int64_t> Maxclusterrank;
-  std::optional<int64_t> Maxntidx;
-  ///@}
-
   /// The number of alloca instructions inside the function, the number of those
   /// with allocation sizes that cannot be determined at compile time, and the
   /// sum of the sizes that can be.
@@ -298,68 +284,23 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
   remarkProperty(ORE, F, Name, Value.value());
 }
 
-static std::vector<std::optional<int64_t>>
-parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) {
-  std::vector<std::optional<int64_t>> Result(NumFields);
-  Attribute A = F.getFnAttribute(Name);
-  if (!A.isStringAttribute())
-    return Result;
-  StringRef Rest = A.getValueAsString();
-  for (unsigned I = 0; I < NumFields; ++I) {
-    StringRef Field;
-    std::tie(Field, Rest) = Rest.split(',');
-    if (Field.empty())
-      break;
-    int64_t Val;
-    if (Field.getAsInteger(0, Val)) {
-      F.getContext().emitError("cannot parse integer in attribute '" + Name +
-                               "': " + Field);
-      break;
-    }
-    Result[I] = Val;
-  }
-  if (!Rest.empty())
-    F.getContext().emitError("too many fields in attribute " + Name);
-  return Result;
-}
-
 static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
                                                    StringRef Name) {
-  return parseFnAttrAsIntegerFields(F, Name, 1)[0];
-}
-
-// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp.  Can we
-// share?
-static MDNode *getNVPTXMDNode(Function &F, StringRef Name) {
-  Module &M = *F.getParent();
-  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
-  if (!MD)
-    return nullptr;
-  for (auto *Op : MD->operands()) {
-    if (Op->getNumOperands() != 3)
-      continue;
-    auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
-    if (!KernelOp || KernelOp->getValue() != &F)
-      continue;
-    auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
-    if (!Prop || Prop->getString() != Name)
-      continue;
-    return Op;
-  }
-  return nullptr;
-}
-
-static std::optional<int64_t> parseNVPTXMDNodeAsInteger(Function &F,
-                                                        StringRef Name) {
-  std::optional<int64_t> Result;
-  if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) {
-    auto *Op = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
-    Result = cast<ConstantInt>(Op->getValue())->getZExtValue();
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return std::nullopt;
+  StringRef Field = A.getValueAsString();
+  int64_t Val;
+  if (Field.getAsInteger(0, Val)) {
+    F.getContext().emitError("cannot parse integer in attribute '" + Name +
+                             "': " + Field);
+    return std::nullopt;
   }
-  return Result;
+  return Val;
 }
 
-void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) {
+void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
+                                TargetMachine *TM) {
   KernelInfo KI;
   KI.FlatAddrspace = FAM.getResult<TargetIRAnalysis>(F).getFlatAddressSpace();
 
@@ -367,21 +308,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) {
   KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
   KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams");
   KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit");
-  auto AmdgpuMaxNumWorkgroups =
-      parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3);
-  KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0];
-  KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1];
-  KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2];
-  auto AmdgpuFlatWorkGroupSize =
-      parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2);
-  KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0];
-  KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1];
-  auto AmdgpuWavesPerEu =
-      parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2);
-  KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0];
-  KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1];
-  KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank");
-  KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx");
 
   const DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
@@ -394,15 +320,16 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) {
   REMARK_PROPERTY(ExternalNotKernel);
   REMARK_PROPERTY(OmpTargetNumTeams);
   REMARK_PROPERTY(OmpTargetThreadLimit);
-  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX);
-  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY);
-  REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ);
-  REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin);
-  REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax);
-  REMARK_PROPERTY(AmdgpuWavesPerEuMin);
-  REMARK_PROPERTY(AmdgpuWavesPerEuMax);
-  REMARK_PROPERTY(Maxclusterrank);
-  REMARK_PROPERTY(Maxntidx);
+  // TM might be nullptr if support for the target was not built.  For example,
+  // we currently have some KernelInfo tests where the choice of target isn't
+  // important, so they arbitrarily choose a target triple.  Those tests are
+  // expected to run successfully even if support for that target was not built.
+  if (TM) {
+    TM->getSubtargetImpl(F)->forEachLaunchBound(
+        F, [&](StringRef Name, unsigned Value) {
+          remarkProperty(ORE, F, Name, Value);
+        });
+  }
   REMARK_PROPERTY(Allocas);
   REMARK_PROPERTY(AllocasStaticSizeSum);
   REMARK_PROPERTY(AllocasDyn);
@@ -419,6 +346,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) {
 
 PreservedAnalyses KernelInfoPrinter::run(Function &F,
                                          FunctionAnalysisManager &AM) {
-  KernelInfo::emitKernelInfo(F, AM);
+  KernelInfo::emitKernelInfo(F, AM, TM);
   return PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6b3ccfccf3ae0b..10b0b3f57c2890 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -382,7 +382,7 @@ FUNCTION_PASS("irce", IRCEPass())
 FUNCTION_PASS("jump-threading", JumpThreadingPass())
 FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass());
 FUNCTION_PASS("kcfi", KCFIPass())
-FUNCTION_PASS("kernel-info", KernelInfoPrinter())
+FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM))
 FUNCTION_PASS("lcssa", LCSSAPass())
 FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
 FUNCTION_PASS("lint", LintPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 555302b290da2a..c0e3df93264c9d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -790,9 +790,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   if (KernelInfoEndLTO) {
     PB.registerFullLinkTimeOptimizationLastEPCallback(
-        [](ModulePassManager &PM, OptimizationLevel Level) {
+        [this](ModulePassManager &PM, OptimizationLevel Level) {
           FunctionPassManager FPM;
-          FPM.addPass(KernelInfoPrinter());
+          FPM.addPass(KernelInfoPrinter(this));
           PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         });
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 52c24a5c25ec24..f1eb5fcb2c06fb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -711,6 +711,22 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
   return NSAThreshold;
 }
 
+void GCNSubtarget::forEachLaunchBound(
+    const Function &F,
+    std::function<void(StringRef Name, unsigned Value)> Body) const {
+  auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F);
+  Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]);
+  Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]);
+  Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]);
+  auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F);
+  Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first);
+  Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second);
+  auto AmdgpuWavesPerEU = getWavesPerEU(F);
+  Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first);
+  Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second);
+  // TODO: Any others we should add?
+}
+
 GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                            const GCNSubtarget &ST)
     : ST(ST) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7b74eab96c5677..a514945a5e6f56 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1587,6 +1587,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     // the nop.
     return true;
   }
+
+  virtual void forEachLaunchBound(
+      const Function &F,
+      std::function<void(StringRef Name, unsigned Value)> Body) const override;
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b3849..fccb3de4537349 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,7 @@
 
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
 
 using namespace llvm;
 
@@ -69,3 +70,18 @@ bool NVPTXSubtarget::hasImageHandles() const {
 bool NVPTXSubtarget::allowFP16Math() const {
   return hasFP16Math() && NoF16Math == false;
 }
+
+void NVPTXSubtarget::forEachLaunchBound(
+    const Function &F,
+    std::function<void(StringRef Name, unsigned Value)> Body) const {
+  unsigned Val;
+  if (getMaxClusterRank(F, Val))
+    Body("Maxclusterrank", Val);
+  if (auto Val = getMaxNTIDx(F))
+    Body("Maxntidx", *Val);
+  if (auto Val = getMaxNTIDy(F))
+    Body("Maxntidy", *Val);
+  if (auto Val = getMaxNTIDz(F))
+    Body("Maxntidz", *Val);
+  // TODO: Any others we should add?
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 457f10f1d64a26..6cc8b6764cf8e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -119,6 +119,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
 
   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+  virtual void forEachLaunchBound(
+      const Function &F,
+      std::function<void(StringRef Name, unsigned Value)> Body) const override;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 777d1215214ec1..8fd3dacbab87e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -242,9 +242,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   if (KernelInfoEndLTO) {
     PB.registerFullLinkTimeOptimizationLastEPCallback(
-        [](ModulePassManager &PM, OptimizationLevel Level) {
+        [this](ModulePassManager &PM, OptimizationLevel Level) {
           FunctionPassManager FPM;
-          FPM.addPass(KernelInfoPrinter());
+          FPM.addPass(KernelInfoPrinter(this));
           PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         });
   }
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
index 0c98f4ad45950a..472d7c0286b013 100644
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll
@@ -1,5 +1,7 @@
 ; Check info on launch bounds for AMD GPU.
 
+; REQUIRES: amdgpu-registered-target
+
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -disable-output %s 2>&1 | \
 ; RUN:   FileCheck -match-full-lines %s
@@ -7,16 +9,44 @@
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 target triple = "amdgcn-amd-amdhsa"
 
-; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100
-; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220
-; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221
-define void @test() #0 !dbg !5 {
+; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetNumTeams = 100
+; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetThreadLimit = 101
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsX = 200
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsY = 201
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsZ = 202
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMin = 210
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMax = 211
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMin = 2
+; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMax = 9
+define void @all() #0 !dbg !5 {
+entry:
+  ret void
+}
+
+; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetNumTeams = {{.*}}
+; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetThreadLimit = {{.*}}
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsX = 0
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsY = 0
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsZ = 0
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMin = 1
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMax = 1024
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMin = 4
+; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMax = 10
+define void @none() !dbg !6 {
+entry:
+  ret void
+}
+
+; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetNumTeams = 987654321
+; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetThreadLimit = 987654321
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsX = 987654321
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsY = 987654321
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsZ = 987654321
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMin = 1
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMax = 1024
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMin = 4
+; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMax = 10
+define void @bogus() #1 !dbg !7 {
 entry:
   ret void
 }
@@ -26,7 +56,18 @@ attributes #0 = {
   "omp_target_thread_limit"="101"
   "amdgpu-max-num-workgroups"="200,201,202"
   "amdgpu-flat-work-group-size"="210,211"
-  "amdgpu-waves-per-eu"="220,221"
+  "amdgpu-waves-per-eu"="2,9"
+}
+
+; We choose values that are small enough to parse successfully but that are
+; impossibly large.  For values that are validated, we check that they are
+; overridden with realistic values.
+attributes #1 = {
+  "omp_target_num_teams"="987654321"
+  "omp_target_thread_limit"="987654321"
+  "amdgpu-max-num-workgroups"="987654321,987654321,987654321"
+  "amdgpu-flat-work-group-size"="987654321,987654321"
+  "amdgpu-waves-per-eu"="987654321,987654321"
 }
 
 !llvm.module.flags = !{!0}
@@ -37,4 +78,6 @@ attributes #0 = {
 !2 = !DIFile(filename: "test.c", directory: "/tmp")
 !3 = !{}
 !4 = !DISubroutineType(types: !3)
-!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!5 = distinct !DISubprogram(name: "all", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!6 = distinct !DISubprogram(name: "none", scope: !2, file: !2, line: 11, type: !4, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
+!7 = distinct !DISubprogram(name: "bogus", scope: !2, file: !2, line: 12, type: !4, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
index c7339f90e3ca92..d9a024f38652ea 100644
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@@ -1,5 +1,7 @@
 ; Check info on launch bounds for NVPTX.
 
+; REQUIRES: nvptx-registered-target
+
 ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
 ; RUN:     -disable-output %s 2>&1 | \
 ; RUN:   FileCheck -match-full-lines %s
@@ -11,6 +13,8 @@ target triple = "nvptx64-nvidia-cuda"
 ; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101
 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200
 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210
+; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidy = 211
+; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidz = 212
 define void @test() #0 !dbg !5 {
 entry:
   ret void
@@ -23,7 +27,7 @@ attributes #0 = {
 
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!6, !7, !8}
+!nvvm.annotations = !{!6, !7, !8, !9, !10}
 
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@@ -33,4 +37,6 @@ attributes #0 = {
 !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
 !6 = !{ptr @test, !"maxclusterrank", i32 200}
 !7 = !{ptr @test, !"maxntidx", i32 210}
-!8 = distinct !{ptr null, !"kernel", i32 1}
+!8 = !{ptr @test, !"maxntidy", i32 211}
+!9 = !{ptr @test, !"maxntidz", i32 212}
+!10 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index be3b357cc45309..d21dde10f979a6 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -17,8 +17,13 @@
 ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit'
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsX = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsY = 0
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsZ = 0
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMin = 1
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMax = 10
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20
 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0
@@ -34,6 +39,13 @@
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space
 ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsY = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsZ = 0
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMin = 1
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMax = 1024
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMin = 4
+; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMax = 10
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8
 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0
@@ -49,6 +61,13 @@
 ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f'
 ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g'
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsX = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsY = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsZ = 0
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMin = 1
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMax = 1024
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMin = 4
+; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMax = 10
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12
 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0

>From 94d90d17e156f6a8e89cf3155bde2138a65c4f42 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 16 Sep 2024 16:21:32 -0400
Subject: [PATCH 23/27] Adjust forEachLaunchBound param

* std::function -> llvm::function_ref
* unsigned -> int64_t
---
 llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 2 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp         | 2 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h           | 7 ++++---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp        | 2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h          | 7 ++++---
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 485aa7e13fe69d..d301304a47275d 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -343,7 +343,7 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   /// For \p F, call \p Body with the name and value of each launch bound.
   virtual void forEachLaunchBound(
       const Function &F,
-      std::function<void(StringRef Name, unsigned Value)> Body) const {}
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {}
 };
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index f1eb5fcb2c06fb..1ec7a6f64bbf55 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -713,7 +713,7 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
 
 void GCNSubtarget::forEachLaunchBound(
     const Function &F,
-    std::function<void(StringRef Name, unsigned Value)> Body) const {
+    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
   auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F);
   Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]);
   Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7fb7fcd496ade7..0df0a3e8ecca65 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1594,9 +1594,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return false;
   }
 
-  virtual void forEachLaunchBound(
-      const Function &F,
-      std::function<void(StringRef Name, unsigned Value)> Body) const override;
+  virtual void
+  forEachLaunchBound(const Function &F,
+                     llvm::function_ref<void(StringRef Name, int64_t Value)>
+                         Body) const override;
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index fccb3de4537349..ab68f54f0473c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -73,7 +73,7 @@ bool NVPTXSubtarget::allowFP16Math() const {
 
 void NVPTXSubtarget::forEachLaunchBound(
     const Function &F,
-    std::function<void(StringRef Name, unsigned Value)> Body) const {
+    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
   unsigned Val;
   if (getMaxClusterRank(F, Val))
     Body("Maxclusterrank", Val);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 6cc8b6764cf8e6..710faf0665054c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -120,9 +120,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
 
-  virtual void forEachLaunchBound(
-      const Function &F,
-      std::function<void(StringRef Name, unsigned Value)> Body) const override;
+  virtual void
+  forEachLaunchBound(const Function &F,
+                     llvm::function_ref<void(StringRef Name, int64_t Value)>
+                         Body) const override;
 };
 
 } // End llvm namespace

>From 762a217705f0ffd90723e2d8d9d54f1c39975c2a Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 16 Sep 2024 16:21:58 -0400
Subject: [PATCH 24/27] Reuse Function::getFnAttributeAsParsedInteger

---
 llvm/lib/Analysis/KernelInfo.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 6d0efdfec83444..85d923a97740d8 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -286,17 +286,9 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
 
 static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
                                                    StringRef Name) {
-  Attribute A = F.getFnAttribute(Name);
-  if (!A.isStringAttribute())
+  if (!F.hasFnAttribute(Name))
     return std::nullopt;
-  StringRef Field = A.getValueAsString();
-  int64_t Val;
-  if (Field.getAsInteger(0, Val)) {
-    F.getContext().emitError("cannot parse integer in attribute '" + Name +
-                             "': " + Field);
-    return std::nullopt;
-  }
-  return Val;
+  return F.getFnAttributeAsParsedInteger(Name);
 }
 
 void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,

>From df66a3d2c28339f2f3d6cc515a550894e5a05bef Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Mon, 16 Sep 2024 16:22:09 -0400
Subject: [PATCH 25/27] Move forEachLaunchBound to TargetTransformInfo

---
 .../include/llvm/Analysis/TargetTransformInfo.h | 15 +++++++++++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h     |  4 ++++
 llvm/include/llvm/CodeGen/TargetSubtargetInfo.h |  5 -----
 llvm/lib/Analysis/KernelInfo.cpp                | 17 +++++------------
 llvm/lib/Analysis/TargetTransformInfo.cpp       |  6 ++++++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 16 ++++++++++++++++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h   |  3 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp         | 16 ----------------
 llvm/lib/Target/AMDGPU/GCNSubtarget.h           |  5 -----
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp        | 16 ----------------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h          |  5 -----
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp   | 15 +++++++++++++++
 .../lib/Target/NVPTX/NVPTXTargetTransformInfo.h |  3 +++
 13 files changed, 67 insertions(+), 59 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b2124c6106198e..e55aed11e53c92 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1788,6 +1788,11 @@ class TargetTransformInfo {
 
   /// @}
 
+  /// For \p F, call \p Body with the name and value of each launch bound.
+  void forEachLaunchBound(
+      const Function &F,
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
+
 private:
   /// The abstract base class used to type erase specific TTI
   /// implementations.
@@ -2179,6 +2184,9 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual void forEachLaunchBound(
+      const Function &F,
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const = 0;
 };
 
 template <typename T>
@@ -2952,6 +2960,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  void
+  forEachLaunchBound(const Function &F,
+                     llvm::function_ref<void(StringRef Name, int64_t Value)>
+                         Body) const override {
+    return Impl.forEachLaunchBound(F, Body);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 90eef93a2a54d5..684aa44cb945ef 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -973,6 +973,10 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  void forEachLaunchBound(
+      const Function &F,
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {}
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index d301304a47275d..bfaa6450779ae0 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -339,11 +339,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
     // Conservatively assume such instructions exist by default.
     return true;
   }
-
-  /// For \p F, call \p Body with the name and value of each launch bound.
-  virtual void forEachLaunchBound(
-      const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {}
 };
 } // end namespace llvm
 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index 85d923a97740d8..a71d8b3acd09fb 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -15,7 +15,6 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
@@ -294,7 +293,8 @@ static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
 void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
                                 TargetMachine *TM) {
   KernelInfo KI;
-  KI.FlatAddrspace = FAM.getResult<TargetIRAnalysis>(F).getFlatAddressSpace();
+  TargetTransformInfo &TheTTI = FAM.getResult<TargetIRAnalysis>(F);
+  KI.FlatAddrspace = TheTTI.getFlatAddressSpace();
 
   // Record function properties.
   KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
@@ -312,16 +312,9 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
   REMARK_PROPERTY(ExternalNotKernel);
   REMARK_PROPERTY(OmpTargetNumTeams);
   REMARK_PROPERTY(OmpTargetThreadLimit);
-  // TM might be nullptr if support for the target was not built.  For example,
-  // we currently have some KernelInfo tests where the choice of target isn't
-  // important, so they arbitrarily choose a target triple.  Those tests are
-  // expected to run successfully even if support for that target was not built.
-  if (TM) {
-    TM->getSubtargetImpl(F)->forEachLaunchBound(
-        F, [&](StringRef Name, unsigned Value) {
-          remarkProperty(ORE, F, Name, Value);
-        });
-  }
+  TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) {
+    remarkProperty(ORE, F, Name, Value);
+  });
   REMARK_PROPERTY(Allocas);
   REMARK_PROPERTY(AllocasStaticSizeSum);
   REMARK_PROPERTY(AllocasDyn);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2c26493bd3f1ca..cf48fa76141739 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1348,6 +1348,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+void TargetTransformInfo::forEachLaunchBound(
+    const Function &F,
+    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
+  return TTIImpl->forEachLaunchBound(F, Body);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 4cf7733a260ff0..fe362f40cf56ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1390,3 +1390,19 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
+
+void GCNTTIImpl::forEachLaunchBound(
+    const Function &F,
+    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
+  auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
+  Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]);
+  Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]);
+  Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]);
+  auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F);
+  Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first);
+  Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second);
+  auto AmdgpuWavesPerEU = ST->getWavesPerEU(F);
+  Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first);
+  Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second);
+  // TODO: Any others we should add?
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 01df2e6caaba1d..529170888f2e9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -266,6 +266,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   /// \return if target want to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const override;
+  void forEachLaunchBound(
+      const Function &F,
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 1ec7a6f64bbf55..52c24a5c25ec24 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -711,22 +711,6 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
   return NSAThreshold;
 }
 
-void GCNSubtarget::forEachLaunchBound(
-    const Function &F,
-    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
-  auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F);
-  Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]);
-  Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]);
-  Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]);
-  auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F);
-  Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first);
-  Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second);
-  auto AmdgpuWavesPerEU = getWavesPerEU(F);
-  Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first);
-  Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second);
-  // TODO: Any others we should add?
-}
-
 GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                            const GCNSubtarget &ST)
     : ST(ST) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0df0a3e8ecca65..a4ae8a1be32258 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1593,11 +1593,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     // to the same register.
     return false;
   }
-
-  virtual void
-  forEachLaunchBound(const Function &F,
-                     llvm::function_ref<void(StringRef Name, int64_t Value)>
-                         Body) const override;
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index ab68f54f0473c7..420065585b3849 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,7 +12,6 @@
 
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
-#include "NVPTXUtilities.h"
 
 using namespace llvm;
 
@@ -70,18 +69,3 @@ bool NVPTXSubtarget::hasImageHandles() const {
 bool NVPTXSubtarget::allowFP16Math() const {
   return hasFP16Math() && NoF16Math == false;
 }
-
-void NVPTXSubtarget::forEachLaunchBound(
-    const Function &F,
-    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
-  unsigned Val;
-  if (getMaxClusterRank(F, Val))
-    Body("Maxclusterrank", Val);
-  if (auto Val = getMaxNTIDx(F))
-    Body("Maxntidx", *Val);
-  if (auto Val = getMaxNTIDy(F))
-    Body("Maxntidy", *Val);
-  if (auto Val = getMaxNTIDz(F))
-    Body("Maxntidz", *Val);
-  // TODO: Any others we should add?
-}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 710faf0665054c..457f10f1d64a26 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -119,11 +119,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
 
   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
-
-  virtual void
-  forEachLaunchBound(const Function &F,
-                     llvm::function_ref<void(StringRef Name, int64_t Value)>
-                         Body) const override;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 9a8ea8f87896ad..50cc2c8e22d4f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -442,3 +442,18 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
   BaseT::getPeelingPreferences(L, SE, PP);
 }
+
+void NVPTXTTIImpl::forEachLaunchBound(
+    const Function &F,
+    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
+  unsigned Val;
+  if (getMaxClusterRank(F, Val))
+    Body("Maxclusterrank", Val);
+  if (auto Val = getMaxNTIDx(F))
+    Body("Maxntidx", *Val);
+  if (auto Val = getMaxNTIDy(F))
+    Body("Maxntidy", *Val);
+  if (auto Val = getMaxNTIDz(F))
+    Body("Maxntidz", *Val);
+  // TODO: Any others we should add?
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 4160f5f6bfae76..2d794f1d80050c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -124,6 +124,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
       return true;
     }
   }
+  void forEachLaunchBound(
+      const Function &F,
+      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
 };
 
 } // end namespace llvm

>From 3f63d532fa99a59a3be58e31d09943b143b1c889 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 26 Sep 2024 14:03:13 -0400
Subject: [PATCH 26/27] forEachLaunchBound -> collectLaunchBounds

Collect the launch bounds into a caller-provided vector instead of passing
them to a callback.
---
 .../llvm/Analysis/TargetTransformInfo.h       | 21 +++++++-------
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  4 +--
 llvm/lib/Analysis/KernelInfo.cpp              | 28 ++++++-------------
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  6 ++--
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 18 ++++++------
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  6 ++--
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 12 ++++----
 .../Target/NVPTX/NVPTXTargetTransformInfo.h   |  6 ++--
 8 files changed, 45 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 27798ca4747e6d..106fef4ef820b5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1794,10 +1794,10 @@ class TargetTransformInfo {
 
   /// @}
 
-  /// For \p F, call \p Body with the name and value of each launch bound.
-  void forEachLaunchBound(
-      const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
+  /// Collect launch bounds for \p F into \p LB.
+  void
+  collectLaunchBounds(const Function &F,
+                      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
 
 private:
   /// The abstract base class used to type erase specific TTI
@@ -2191,9 +2191,9 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
-  virtual void forEachLaunchBound(
+  virtual void collectLaunchBounds(
       const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const = 0;
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
 };
 
 template <typename T>
@@ -2973,11 +2973,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getMaxNumArgs();
   }
 
-  void
-  forEachLaunchBound(const Function &F,
-                     llvm::function_ref<void(StringRef Name, int64_t Value)>
-                         Body) const override {
-    return Impl.forEachLaunchBound(F, Body);
+  void collectLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
+    Impl.collectLaunchBounds(F, LB);
   }
 };
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7832c2f2c28033..1e05fa7200fe79 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -979,9 +979,9 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
-  void forEachLaunchBound(
+  void collectLaunchBounds(
       const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {}
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
 
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp
index a71d8b3acd09fb..826340ca8401db 100644
--- a/llvm/lib/Analysis/KernelInfo.cpp
+++ b/llvm/lib/Analysis/KernelInfo.cpp
@@ -40,11 +40,8 @@ class KernelInfo {
   /// Whether the function has external linkage and is not a kernel function.
   bool ExternalNotKernel = false;
 
-  /// OpenMP Launch bounds.
-  ///@{
-  std::optional<int64_t> OmpTargetNumTeams;
-  std::optional<int64_t> OmpTargetThreadLimit;
-  ///@}
+  /// Launch bounds.
+  SmallVector<std::pair<StringRef, int64_t>> LaunchBounds;
 
   /// The number of alloca instructions inside the function, the number of those
   /// with allocation sizes that cannot be determined at compile time, and the
@@ -276,13 +273,6 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
   });
 }
 
-static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
-                           StringRef Name, std::optional<int64_t> Value) {
-  if (!Value)
-    return;
-  remarkProperty(ORE, F, Name, Value.value());
-}
-
 static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
                                                    StringRef Name) {
   if (!F.hasFnAttribute(Name))
@@ -298,8 +288,11 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
 
   // Record function properties.
   KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
-  KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams");
-  KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit");
+  if (auto Val = parseFnAttrAsInteger(F, "omp_target_num_teams"))
+    KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val});
+  if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit"))
+    KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val});
+  TheTTI.collectLaunchBounds(F, KI.LaunchBounds);
 
   const DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
@@ -310,11 +303,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
 #define REMARK_PROPERTY(PROP_NAME)                                             \
   remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME)
   REMARK_PROPERTY(ExternalNotKernel);
-  REMARK_PROPERTY(OmpTargetNumTeams);
-  REMARK_PROPERTY(OmpTargetThreadLimit);
-  TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) {
-    remarkProperty(ORE, F, Name, Value);
-  });
+  for (auto LB : KI.LaunchBounds)
+    remarkProperty(ORE, F, LB.first, LB.second);
   REMARK_PROPERTY(Allocas);
   REMARK_PROPERTY(AllocasStaticSizeSum);
   REMARK_PROPERTY(AllocasDyn);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 946754735efcb0..6c24ec34d80b2e 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1354,10 +1354,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
-void TargetTransformInfo::forEachLaunchBound(
+void TargetTransformInfo::collectLaunchBounds(
     const Function &F,
-    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
-  return TTIImpl->forEachLaunchBound(F, Body);
+    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
+  return TTIImpl->collectLaunchBounds(F, LB);
 }
 
 TargetTransformInfo::Concept::~Concept() = default;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index fe362f40cf56ff..6094e5a42f4bf1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1391,18 +1391,18 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
 
-void GCNTTIImpl::forEachLaunchBound(
+void GCNTTIImpl::collectLaunchBounds(
     const Function &F,
-    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
+    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
   auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
-  Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]);
-  Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]);
-  Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]);
+  LB.push_back({"AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]});
+  LB.push_back({"AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]});
+  LB.push_back({"AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]});
   auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F);
-  Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first);
-  Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second);
+  LB.push_back({"AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first});
+  LB.push_back({"AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second});
   auto AmdgpuWavesPerEU = ST->getWavesPerEU(F);
-  Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first);
-  Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second);
+  LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first});
+  LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second});
   // TODO: Any others we should add?
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 34944e6c478aa2..4b30ac71ccd33f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -265,9 +265,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   /// \return if target want to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const override;
-  void forEachLaunchBound(
-      const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
+  void
+  collectLaunchBounds(const Function &F,
+                      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 1a99a1cf911444..4752cf01dd2050 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -443,16 +443,16 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
   BaseT::getPeelingPreferences(L, SE, PP);
 }
 
-void NVPTXTTIImpl::forEachLaunchBound(
+void NVPTXTTIImpl::collectLaunchBounds(
     const Function &F,
-    llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const {
+    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
   if (auto Val = getMaxClusterRank(F))
-    Body("Maxclusterrank", *Val);
+    LB.push_back({"Maxclusterrank", *Val});
   if (auto Val = getMaxNTIDx(F))
-    Body("Maxntidx", *Val);
+    LB.push_back({"Maxntidx", *Val});
   if (auto Val = getMaxNTIDy(F))
-    Body("Maxntidy", *Val);
+    LB.push_back({"Maxntidy", *Val});
   if (auto Val = getMaxNTIDz(F))
-    Body("Maxntidz", *Val);
+    LB.push_back({"Maxntidz", *Val});
   // TODO: Any others we should add?
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 7e7d1dd5888554..07c14e88cc786f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -123,9 +123,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
       return true;
     }
   }
-  void forEachLaunchBound(
-      const Function &F,
-      llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const;
+  void
+  collectLaunchBounds(const Function &F,
+                      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
 };
 
 } // end namespace llvm

>From feeaa3780cf725f0da1404b99b3f8634dbce75de Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Sat, 28 Sep 2024 12:50:23 -0400
Subject: [PATCH 27/27] Remove redundant private

---
 llvm/include/llvm/Analysis/KernelInfo.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
index 6633c28858a2f3..75d92c202212b5 100644
--- a/llvm/include/llvm/Analysis/KernelInfo.h
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -22,7 +22,6 @@ namespace llvm {
 class TargetMachine;
 
 class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
-private:
   TargetMachine *TM;
 
 public: