[llvm] [AMDGPU] Add AMDGPU-specific module splitting (PR #89245)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sat Apr 20 02:24:45 PDT 2024


================
@@ -0,0 +1,733 @@
+//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements a module splitting algorithm designed to support the
+/// FullLTO --lto-partitions option for parallel codegen. This is completely
+/// different from the common SplitModule pass, as this system is designed with
+/// AMDGPU in mind.
+///
+/// The basic idea of this module splitting implementation is the same as
+/// SplitModule: load-balance the module's functions across a set of N
+/// partitions to allow parallel codegen. However, it does it very
+/// differently than the target-agnostic variant:
+///   - Kernels are used as the module's "roots".
+///     They're known entry points on AMDGPU, and everything else is often
+///     internal only.
+///   - Each kernel has a set of dependencies, and when a kernel and its
+///     dependencies is considered "big", we try to put it in a partition where
+///     most dependencies are already imported, to avoid duplicating large
+///     amounts of code.
+///   - There's special care for indirect calls in order to ensure
+///     AMDGPUResourceUsageAnalysis can work correctly.
+///
+/// This file also includes a more elaborate logging system to enable
+/// users to easily generate logs that (if desired) do not include any value
+/// names, in order to not leak information about the source file.
+/// Such logs are very helpful to understand and fix potential issues with
+/// module splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSplitModule.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-split-module"
+
+namespace {
+
+static cl::opt<float> LargeKernelFactor(
+    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0), cl::Hidden,
+    cl::desc(
+        "consider a kernel as large and needing special treatment when it "
+        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
+        "means if the kernel and its dependencies is 2 times bigger than "
+        "an average partition; 0 disables large kernels handling entirely"));
+
+static cl::opt<float> LargeKernelOverlapForMerge(
+    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8),
+    cl::Hidden,
+    cl::desc("defines how much overlap between two large kernel's dependencies "
+             "is needed to put them in the same partition"));
+
+static cl::opt<bool> NoExternalizeGlobals(
+    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
+    cl::desc("disables externalization of global variable with local linkage; "
+             "may cause globals to be duplicated which increases binary size"));
+
+static cl::opt<std::string>
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
+
+static cl::opt<bool>
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
+
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+
+static std::string getName(const Value &V) {
+  static std::optional<bool> HideNames;
+  if (!HideNames) {
+    if (LogPrivate.getNumOccurrences())
+      HideNames = LogPrivate;
+    else {
+      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (EV.value_or("0") != "0");
+    }
+  }
+
+  if (!*HideNames)
+    return V.getName().str();
+  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+               /*LowerCase*/ true);
+}
+
+/// Main logging helper.
+///
+/// Logging can be configured by the following environment variable.
+///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
+///     If set, uses <filepath> as the directory to write logfiles to
+///     each time module splitting is used.
+///   AMD_SPLIT_MODULE_LOG_PRIVATE
+///     If set to anything other than zero, all names are hidden.
+///
+/// Both environment variables have corresponding CL options which
+/// takes priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purpose, it may be done everytime only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+///   if (SML) {
+///     // Executes only if logging to a file or if -debug is available and
+///     used.
+///   }
+/// \endcode
+class SplitModuleLogger {
+public:
+  SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    if (!sys::fs::is_directory(LogDir)) {
+      report_fatal_error("invalid AMDGPU split module log directory: '" +
+                             Twine(LogDir) + "' is not a directory",
+                         /*CrashDiag=*/false);
+    }
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    SmallString<0> FilePath;
+    int Fd;
+    std::string LogFile = (LogDir + "/" + "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err = sys::fs::createUniqueFile(LogFile, Fd, FilePath)) {
+      dbgs() << LogFile << "\n";
----------------
arsenm wrote:

If this is always printed, don't use dbgs 

https://github.com/llvm/llvm-project/pull/89245


More information about the llvm-commits mailing list