[llvm] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (PR #85626)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 21 03:43:25 PDT 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/85626

>From 68219d99efb16750a1be2fe6553ea96a837d38b7 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 11 Mar 2024 09:27:29 +0100
Subject: [PATCH 1/3] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline

This change allows `--lto-partitions` to be used in some cases (on a best-effort basis; it is not guaranteed to work perfectly), because LDS is now lowered before the module is split for parallel codegen.
---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2b457fe519d96c..c96625092a76c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -793,6 +793,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
 
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // We want to support the -lto-partitions=N option as "best effort".
+        // For that, we need to lower LDS earlier in the pipeline before the
+        // module is partitioned for codegen.
+        if (EnableLowerModuleLDS)
+          PM.addPass(AMDGPULowerModuleLDSPass(*this));
+      });
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {

>From b3ba8365cb7196b0739c0af3b948f82421ca8d40 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 14 Mar 2024 13:15:30 +0100
Subject: [PATCH 2/3] add lto pipeline test

---
 .../CodeGen/AMDGPU/lto-lower-module-lds.ll    | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll

diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
new file mode 100644
index 00000000000000..b813b8047bf24c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -0,0 +1,47 @@
+
+; Default O0
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O0
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O1
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O1
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O2
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O2
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O3
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O3
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; First print will be from the New PM during the full LTO pipeline.
+; Second print will be from the legacy PM during the CG pipeline.
+
+; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module]
+; CHECK: ModulePass Manager
+; CHECK:   Lower uses of LDS variables from non-kernel functions
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+  store i32 1, ptr addrspace(3) @lds
+  ret void
+}

>From df1ff393b1cb4bea3c2a2882238ebae3492ac08b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Mar 2024 11:43:10 +0100
Subject: [PATCH 3/3] poison init

---
 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
index b813b8047bf24c..f1d946376afe06 100644
--- a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -38,7 +38,7 @@
 ; CHECK: ModulePass Manager
 ; CHECK:   Lower uses of LDS variables from non-kernel functions
 
-@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4
 
 define amdgpu_kernel void @test() {
 entry:



More information about the llvm-commits mailing list