[PATCH] D29214: [AMDGPU] Internalize non-kernel symbols

Thu Jan 26 23:29:13 PST 2017

rampitec created this revision.
Herald added a reviewer: tstellarAMD.
Herald added subscribers: tony-tye, yaxunl, nhaehnle, wdng, kzhuravl, arsenm.

Since we have no call support and late linking we can produce code
only for used symbols. This saves compilation time, size of the final
executable, and size of any intermediate dumps.

Run Internalize pass early in the opt pipeline followed by global
DCE pass. To enable it RT can pass -amdgpu-whole-program option.


Repository:
  rL LLVM

https://reviews.llvm.org/D29214

Files:
  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
  test/CodeGen/AMDGPU/internalize.ll


Index: test/CodeGen/AMDGPU/internalize.ll
===================================================================

--- /dev/null
+++ test/CodeGen/AMDGPU/internalize.ll
@@ -0,0 +1,35 @@
+; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-whole-program < %s | FileCheck %s
+; CHECK-NOT: unused
+; CHECK-NOT: foo_used
+; CHECK: gvar_used
+; CHECK: main_kernel
+
+ at gvar_unused = addrspace(1) global i32 undef, align 4
+ at gvar_used = addrspace(1) global i32 undef, align 4
+
+; Function Attrs: alwaysinline nounwind
+define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+entry:
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+entry:
+  store i32 %tid, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @main_kernel() {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  tail call void @foo_used(i32 addrspace(1)* @gvar_used, i32 %tid) nounwind
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+
+attributes #1 = { alwaysinline nounwind }
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -84,6 +84,13 @@
   cl::init(false),
   cl::Hidden);
 
+// Option to enable whole program mode
+static cl::opt<bool> WholeProgram(
+  "amdgpu-whole-program",
+  cl::desc("Enable whole program mode"),
+  cl::init(false),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -212,6 +219,32 @@
     [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
       PM.add(createAMDGPUUnifyMetadataPass());
     });
+  bool Whole = WholeProgram && (getTargetTriple().getArch() == Triple::amdgcn);
+  Builder.addExtension(
+    PassManagerBuilder::EP_ModuleOptimizerEarly,
+    [Whole](const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) {
+      if (Builder.OptLevel > 0 && Whole) {
+        PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
+          if (const Function *F = dyn_cast<Function>(&GV)) {
+            if (F->isDeclaration())
+                return true;
+            switch (F->getCallingConv()) {
+            default:
+              return false;
+            case CallingConv::AMDGPU_VS:
+            case CallingConv::AMDGPU_GS:
+            case CallingConv::AMDGPU_PS:
+            case CallingConv::AMDGPU_CS:
+            case CallingConv::AMDGPU_KERNEL:
+            case CallingConv::SPIR_KERNEL:
+              return true;
+            }
+          }
+          return !GV.use_empty();
+        }));
+        PM.add(createGlobalDCEPass());
+      }
+  });
 }
 
 //===----------------------------------------------------------------------===//


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D29214.86020.patch
Type: text/x-patch
Size: 2984 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170127/af764db3/attachment.bin>