[llvm] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (PR #85626)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 21 03:43:25 PDT 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/85626
>From 68219d99efb16750a1be2fe6553ea96a837d38b7 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 11 Mar 2024 09:27:29 +0100
Subject: [PATCH 1/3] [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline
This change allows us to use `--lto-partitions` in some cases (not guaranteed it works perfectly), as LDS is lowered before the module is split for parallel codegen.
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2b457fe519d96c..c96625092a76c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -793,6 +793,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});
+
+ PB.registerFullLinkTimeOptimizationLastEPCallback(
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ // We want to support the -lto-partitions=N option as "best effort".
+ // For that, we need to lower LDS earlier in the pipeline before the
+ // module is partitioned for codegen.
+ if (EnableLowerModuleLDS)
+ PM.addPass(AMDGPULowerModuleLDSPass(*this));
+ });
}
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
>From b3ba8365cb7196b0739c0af3b948f82421ca8d40 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 14 Mar 2024 13:15:30 +0100
Subject: [PATCH 2/3] add lto pipeline test
---
.../CodeGen/AMDGPU/lto-lower-module-lds.ll | 47 +++++++++++++++++++
1 file changed, 47 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
new file mode 100644
index 00000000000000..b813b8047bf24c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -0,0 +1,47 @@
+
+; Default O0
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O0
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O1
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O1
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O2
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O2
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O3
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O3
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; First print will be from the New PM during the full LTO pipeline.
+; Second print will be from the legacy PM during the CG pipeline.
+
+; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module]
+; CHECK: ModulePass Manager
+; CHECK: Lower uses of LDS variables from non-kernel functions
+
+ at lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+ store i32 1, ptr addrspace(3) @lds
+ ret void
+}
>From df1ff393b1cb4bea3c2a2882238ebae3492ac08b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Mar 2024 11:43:10 +0100
Subject: [PATCH 3/3] poison init
---
llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
index b813b8047bf24c..f1d946376afe06 100644
--- a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -38,7 +38,7 @@
; CHECK: ModulePass Manager
; CHECK: Lower uses of LDS variables from non-kernel functions
- at lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+ at lds = internal unnamed_addr addrspace(3) global i32 poison, align 4
define amdgpu_kernel void @test() {
entry:
More information about the llvm-commits
mailing list