[llvm] 29a74a3 - [OpenMP] Add an option to always inline OpenMP device functions.

Tue Aug 31 15:48:47 PDT 2021

Author: Joseph Huber
Date: 2021-08-31T18:48:30-04:00
New Revision: 29a74a39150ad0a041922431cda68f1afba830c9

URL: https://github.com/llvm/llvm-project/commit/29a74a39150ad0a041922431cda68f1afba830c9
DIFF: https://github.com/llvm/llvm-project/commit/29a74a39150ad0a041922431cda68f1afba830c9.diff

LOG: [OpenMP] Add an option to always inline OpenMP device functions.

Performance on GPU targets can be highly variable, sometimes inlining
everything hurts performance and sometimes it greatly improves it. Add
an option to toggle this behaviour to better investigate it.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D109014

Added: 
    llvm/test/Transforms/OpenMP/always_inline_device.ll

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index c48ae6ec2f36a..5167494293a30 100644

--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -97,6 +97,11 @@ static cl::opt<bool> PrintModuleAfterOptimizations(
     cl::desc("Print the current module after OpenMP optimizations."),
     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> AlwaysInlineDeviceFunctions(
+    "openmp-opt-inline-device", cl::ZeroOrMore,
+    cl::desc("Inline all applicible functions on the device."), cl::Hidden,
+    cl::init(false));
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -4481,6 +4486,13 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
   bool Changed = OMPOpt.run(true);
 
+  // Optionally inline device functions for potentially better performance.
+  if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
+    for (Function &F : M)
+      if (!F.isDeclaration() && !Kernels.contains(&F) &&
+          !F.hasFnAttribute(Attribute::NoInline))
+        F.addFnAttr(Attribute::AlwaysInline);
+
   if (PrintModuleAfterOptimizations)
     LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
 

diff  --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
new file mode 100644
index 0000000000000..7ffac2215a156
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt < %s -S -passes=openmp-opt -openmp-opt-inline-device | FileCheck %s
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+ at 0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+ at __omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
+ at llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
+
+; Function Attrs: convergent norecurse nounwind
+define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
+; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false)
+; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK:       user_code.entry:
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
+; CHECK-NEXT:    ret void
+; CHECK:       worker.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit
+
+user_code.entry:                                  ; preds = %entry
+  call void @bar() #2
+  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+  ret void
+
+worker.exit:                                      ; preds = %entry
+  ret void
+}
+
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
+
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
+
+; Function Attrs: convergent nounwind
+define hidden void @bar() #1 {
+; CHECK: Function Attrs: alwaysinline convergent nounwind
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
+attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
+attributes #2 = { convergent }
+
+!omp_offload.info = !{!0}
+!nvvm.annotations = !{!1}
+!llvm.module.flags = !{!2, !3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
+!1 = !{void ()* @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 7, !"openmp", i32 50}
+!4 = !{i32 7, !"openmp-device", i32 50}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"frame-pointer", i32 2}
+!7 = !{!"clang version 14.0.0"}