[llvm] [KernelInfo] Implement new LLVM IR pass for GPU code analysis (PR #102944)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 15 10:51:40 PDT 2024


================
@@ -0,0 +1,330 @@
+//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfoPrinter class used to emit remarks about
+// function properties from a GPU kernel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/KernelInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "kernel-info"
+
+/// Data structure holding function info for kernels.
+class KernelInfo {
+  void updateForBB(const BasicBlock &BB, int64_t Direction,
+                   OptimizationRemarkEmitter &ORE);
+
+public:
+  static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
+                             TargetMachine *TM);
+
+  /// Whether the function has external linkage and is not a kernel function.
+  bool ExternalNotKernel = false;
+
+  /// Launch bounds.
+  SmallVector<std::pair<StringRef, int64_t>> LaunchBounds;
+
+  /// The number of alloca instructions inside the function, the number of
+  /// those with allocation sizes that cannot be determined at compile time,
+  /// and the sum of the sizes that can be determined statically.
+  ///
+  /// On at least some GPU archs, the current implementation might never
+  /// produce AllocasDyn > 0, but we report AllocasDyn anyway in case the
+  /// implementation changes.
+  int64_t Allocas = 0;
+  int64_t AllocasDyn = 0;
+  int64_t AllocasStaticSizeSum = 0;
+
+  /// Number of direct/indirect calls (anything derived from CallBase).
+  int64_t DirectCalls = 0;
+  int64_t IndirectCalls = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of direct calls to inline assembly.
+  int64_t InlineAssemblyCalls = 0;
+
+  /// Number of calls of type InvokeInst.
+  int64_t Invokes = 0;
+
+  /// Target-specific flat address space.
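+  /// On typical GPU targets this is the generic address space through which
+  /// global, shared/local, and private memory can all be accessed (see
+  /// TargetTransformInfo::getFlatAddressSpace).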
+  unsigned FlatAddrspace;
+
+  /// Number of flat address space memory accesses (via load, store, etc.).
+  int64_t FlatAddrspaceAccesses = 0;
+};
+
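+// In textual IR, the "kernel" attribute checked here appears as, e.g.:
+//
+//   define void @k() #0 { ... }
+//   attributes #0 = { "kernel" }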
+static bool isKernelFunction(Function &F) {
+  // TODO: Is this general enough?  Consider languages beyond OpenMP.
+  return F.hasFnAttribute("kernel");
+}
+
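+// Append an identification of V (a function, or, e.g., an inline-asm callee)
+// to remark R, preferring the name from debug info when one is available.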
+static void identifyCallee(OptimizationRemark &R, const Module *M,
+                           const Value *V, StringRef Kind = "") {
+  SmallString<100> Name; // might be function name or asm expression
+  if (const Function *F = dyn_cast<Function>(V)) {
+    if (auto *SubProgram = F->getSubprogram()) {
+      if (SubProgram->isArtificial())
+        R << "artificial ";
+      Name = SubProgram->getName();
+    }
+  }
+  if (Name.empty()) {
+    raw_svector_ostream OS(Name);
+    V->printAsOperand(OS, /*PrintType=*/false, M);
+  }
+  if (!Kind.empty())
+    R << Kind << " ";
+  R << "'" << Name << "'";
+}
+
+static void identifyFunction(OptimizationRemark &R, const Function &F) {
+  identifyCallee(R, F.getParent(), &F, "function");
+}
+
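+// Emit a remark for an alloca. Illustrative output, following the format
+// constructed below:
+//
+//   in function 'f', alloca 'x' with static size of 4 bytes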
+static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller,
+                         const AllocaInst &Alloca,
+                         TypeSize::ScalarTy StaticSize) {
+  ORE.emit([&] {
+    StringRef Name;
+    DebugLoc Loc;
+    bool Artificial = false;
+    auto DVRs = findDVRDeclares(&const_cast<AllocaInst &>(Alloca));
+    if (!DVRs.empty()) {
+      const DbgVariableRecord &DVR = **DVRs.begin();
+      Name = DVR.getVariable()->getName();
+      Loc = DVR.getDebugLoc();
+      Artificial = DVR.getVariable()->isArtificial();
+    }
+    OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc),
+                         Alloca.getParent());
+    R << "in ";
+    identifyFunction(R, Caller);
+    R << ", ";
+    if (Artificial)
+      R << "artificial ";
+    if (Name.empty()) {
+      R << "unnamed alloca ";
+      if (DVRs.empty())
+        R << "(missing debug metadata) ";
+    } else {
+      R << "alloca '" << Name << "' ";
+    }
+    R << "with ";
+    if (StaticSize)
+      R << "static size of " << itostr(StaticSize) << " bytes";
+    else
+      R << "dynamic size";
+    return R;
+  });
+}
+
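+// Emit a remark for a call. Illustrative output, following the format
+// constructed below:
+//
+//   in function 'f', direct call, callee is 'g'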
+static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
+                       const CallBase &Call, StringRef CallKind,
+                       StringRef RemarkKind) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call);
+    R << "in ";
+    identifyFunction(R, Caller);
+    R << ", " << CallKind << ", callee is ";
+    identifyCallee(R, Caller.getParent(), Call.getCalledOperand());
+    return R;
+  });
+}
+
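+// Emit a remark for a flat-address-space access. Illustrative output:
+//
+//   in function 'f', 'load' instruction ('%0') accesses memory in flat
+//   address space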
+static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
+                                      const Function &Caller,
+                                      const Instruction &Inst) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst);
+    R << "in ";
+    identifyFunction(R, Caller);
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) {
+      R << ", '" << II->getCalledFunction()->getName() << "' call";
+    } else {
+      R << ", '" << Inst.getOpcodeName() << "' instruction";
+    }
+    if (!Inst.getType()->isVoidTy()) {
+      SmallString<20> Name;
+      raw_svector_ostream OS(Name);
+      Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
+      R << " ('" << Name << "')";
+    }
+    R << " accesses memory in flat address space";
+    return R;
+  });
+}
+
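+// Tally the basic block's contribution to this function's properties.
+// Direction must be +1 (to add the block's counts) or -1 (to subtract them).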
+void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction,
+                             OptimizationRemarkEmitter &ORE) {
+  assert(Direction == 1 || Direction == -1);
+  const Function &F = *BB.getParent();
+  const Module &M = *F.getParent();
+  const DataLayout &DL = M.getDataLayout();
+  for (const Instruction &I : BB.instructionsWithoutDebug()) {
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+      Allocas += Direction;
+      TypeSize::ScalarTy StaticSize = 0;
+      if (std::optional<TypeSize> Size = Alloca->getAllocationSize(DL)) {
+        StaticSize = Size->getFixedValue();
+        assert(StaticSize <=
+               (TypeSize::ScalarTy)std::numeric_limits<int64_t>::max());
+        AllocasStaticSizeSum += Direction * StaticSize;
+      } else {
+        AllocasDyn += Direction;
+      }
+      remarkAlloca(ORE, F, *Alloca, StaticSize);
+    } else if (const CallBase *Call = dyn_cast<CallBase>(&I)) {
+      SmallString<40> CallKind;
+      SmallString<40> RemarkKind;
+      if (Call->isIndirectCall()) {
+        IndirectCalls += Direction;
+        CallKind += "indirect";
+        RemarkKind += "Indirect";
+      } else {
+        DirectCalls += Direction;
+        CallKind += "direct";
+        RemarkKind += "Direct";
+      }
+      if (isa<InvokeInst>(Call)) {
+        Invokes += Direction;
+        CallKind += " invoke";
+        RemarkKind += "Invoke";
+      } else {
+        CallKind += " call";
+        RemarkKind += "Call";
+      }
+      if (!Call->isIndirectCall()) {
+        if (const Function *Callee = Call->getCalledFunction()) {
+          if (!Callee->isIntrinsic() && !Callee->isDeclaration()) {
+            DirectCallsToDefinedFunctions += Direction;
+            CallKind += " to defined function";
+            RemarkKind += "ToDefinedFunction";
+          }
+        } else if (Call->isInlineAsm()) {
+          InlineAssemblyCalls += Direction;
+          CallKind += " to inline assembly";
+          RemarkKind += "ToInlineAssembly";
+        }
+      }
+      remarkCall(ORE, F, *Call, CallKind, RemarkKind);
+      if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
+        if (MI->getDestAddressSpace() == FlatAddrspace) {
+          FlatAddrspaceAccesses += Direction;
+          remarkFlatAddrspaceAccess(ORE, F, I);
+        } else if (const AnyMemTransferInst *MT =
+                       dyn_cast<AnyMemTransferInst>(MI)) {
+          if (MT->getSourceAddressSpace() == FlatAddrspace) {
+            FlatAddrspaceAccesses += Direction;
+            remarkFlatAddrspaceAccess(ORE, F, I);
+          }
+        }
+      }
+    } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+      if (Load->getPointerAddressSpace() == FlatAddrspace) {
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
+      }
+    } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+      if (Store->getPointerAddressSpace() == FlatAddrspace) {
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
+      }
+    } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
+      if (At->getPointerAddressSpace() == FlatAddrspace) {
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
+      }
+    } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
+      if (At->getPointerAddressSpace() == FlatAddrspace) {
+        FlatAddrspaceAccesses += Direction;
+        remarkFlatAddrspaceAccess(ORE, F, I);
+      }
+    }
+  }
+}
+
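+// Emit a remark reporting a single computed property. Illustrative output:
+//
+//   in function 'f', DirectCalls = 2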
+static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
+                           StringRef Name, int64_t Value) {
+  ORE.emit([&] {
+    OptimizationRemark R(DEBUG_TYPE, Name, &F);
+    R << "in ";
+    identifyFunction(R, F);
+    R << ", " << Name << " = " << itostr(Value);
+    return R;
+  });
+}
+
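+// Return the named function attribute parsed as an integer, or std::nullopt
+// if the attribute is absent. For example, "omp_target_num_teams"="256"
+// yields 256.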
+static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
+                                                   StringRef Name) {
+  if (!F.hasFnAttribute(Name))
+    return std::nullopt;
+  return F.getFnAttributeAsParsedInteger(Name);
+}
+
+void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
+                                TargetMachine *TM) {
+  KernelInfo KI;
+  TargetTransformInfo &TheTTI = FAM.getResult<TargetIRAnalysis>(F);
+  KI.FlatAddrspace = TheTTI.getFlatAddressSpace();
+
+  // Record function properties.
+  KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F);
+  for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) {
+    if (auto Val = parseFnAttrAsInteger(F, Name))
+      KI.LaunchBounds.push_back({Name, *Val});
+  }
+  TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds);
+
+  const DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  for (const auto &BB : F)
+    if (DT.isReachableFromEntry(&BB))
----------------
arsenm wrote:

Why does this need to worry about unreachable blocks? This isn't doing any transforms that would need to account for broken IR.

https://github.com/llvm/llvm-project/pull/102944

