[llvm] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (PR #75333)

Wed Jan 31 01:25:40 PST 2024

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/75333

>From 7384287f4d262f92d07269939ae50e2a757a9cf4 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 13 Dec 2023 13:32:42 +0100
Subject: [PATCH] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline

This change allows us to use `--lto-partitions` in some cases (not guaranteed it works perfectly), as LDS is lowered before the module is split for parallel codegen.

LowerLDS doesn't support being run twice because
it'll treat the already-lowered LDS GVs as "absolute address" LDS variables, which aren't supported, so I just added a module flag to detect multiple runs.
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  5 ++-
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       | 24 ++++++++++--
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  9 +++++
 .../CodeGen/AMDGPU/lower-module-lds-reruns.ll | 39 +++++++++++++++++++
 4 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-module-lds-reruns.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 36af767a70b0a..7dbd903a5a27c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -131,7 +131,10 @@ extern char &AMDGPULowerModuleLDSLegacyPassID;
 
 struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
   const AMDGPUTargetMachine &TM;
-  AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+  bool IsEarlyRun;
+  AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_,
+                           bool IsEarlyRun = false)
+      : TM(TM_), IsEarlyRun(IsEarlyRun) {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 5762f1906a16d..6af1780c0969c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -215,6 +215,12 @@ using namespace llvm;
 
 namespace {
 
+cl::opt<bool>
+    ForceAddModuleFlag("amdgpu-lower-module-lds-force-add-moduleflag",
+                       cl::desc("Always add the module flag that prevents "
+                                "multiple runs of LowerModuleLDS."),
+                       cl::init(false), cl::ReallyHidden);
+
 cl::opt<bool> SuperAlignLDSGlobals(
     "amdgpu-super-align-lds-globals",
     cl::desc("Increase alignment of LDS if it is not on align boundary"),
@@ -254,6 +260,7 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
 
 class AMDGPULowerModuleLDS {
   const AMDGPUTargetMachine &TM;
+  bool IsEarlyRun;
 
   static void
   removeLocalVarsFromUsedLists(Module &M,
@@ -328,7 +335,8 @@ class AMDGPULowerModuleLDS {
   }
 
 public:
-  AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+  AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_, bool IsEarlyRun = false)
+      : TM(TM_), IsEarlyRun(IsEarlyRun) {}
 
   using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
 
@@ -1133,6 +1141,15 @@ class AMDGPULowerModuleLDS {
   }
 
   bool runOnModule(Module &M) {
+    // This pass may run twice in a full LTO pipeline.
+    //
+    // If we ran it early, we'll have added metadata to skip next runs.
+    if (M.getModuleFlag("amdgcn.lowered_module_lds"))
+      return false;
+    if (IsEarlyRun || ForceAddModuleFlag)
+      M.addModuleFlag(Module::ModFlagBehavior::Warning,
+                      "amdgcn.lowered_module_lds", 1);
+
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
 
@@ -1626,6 +1643,7 @@ llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
 
 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                 ModuleAnalysisManager &) {
-  return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
-                                                 : PreservedAnalyses::all();
+  return AMDGPULowerModuleLDS(TM, IsEarlyRun).runOnModule(M)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b8a7a5e208021..af521169859ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -779,6 +779,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
 
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // We want to support the -lto-partitions=N option as "best effort".
+        // For that, we need to lower LDS earlier in the pipeline before the
+        // module is partitioned for codegen.
+        if (EnableLowerModuleLDS)
+          PM.addPass(AMDGPULowerModuleLDSPass(*this, /*IsEarlyRun*/ true));
+      });
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-reruns.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-reruns.ll
new file mode 100644
index 0000000000000..f0a46b2d6ead8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-reruns.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %s -o %t.ll
+; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - 2>&1 | FileCheck %s --check-prefix=ERR
+
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module --amdgpu-lower-module-lds-force-add-moduleflag=1 %s -o %t.ll
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - | FileCheck %s
+
+; Check that re-runs of LowerModuleLDS don't crash when the module flag is used.
+;
+; We first check that this test still crashes when run twice. If it no longer crashes at some point,
+; we should update it to ensure the flag still does its job.
+;
+; This test just has the bare minimum checks to see if the pass ran.
+
+; ERR: LLVM ERROR: LDS variables with absolute addresses are unimplemented.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8
+
+; CHECK: attributes #0 = { "amdgpu-lds-size"="12" }
+
+@var0 = addrspace(3) global float poison, align 8
+@var1 = addrspace(3) global i32 poison, align 8
+@ptr = addrspace(1) global ptr addrspace(3) @var1, align 4
+@with_init = addrspace(3) global i64 0
+
+define void @func() {
+  %dec = atomicrmw fsub ptr addrspace(3) @var0, float 1.0 monotonic
+  %val0 = load i32, ptr addrspace(3) @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, ptr addrspace(3) @var1, align 4
+  %unused0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 monotonic
+  ret void
+}
+
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub ptr addrspace(3) @var0, float 2.0 monotonic
+  ret void
+}