[llvm] 7d69da7 - [OpenMP] Enable HeapToStack conversion in OpenMPOpt for new RTL globalization calls

Tue Jun 22 10:23:48 PDT 2021

Author: Joseph Huber
Date: 2021-06-22T13:23:05-04:00
New Revision: 7d69da71dd35f91a5d4310b16940405e7f0c2f20

URL: https://github.com/llvm/llvm-project/commit/7d69da71dd35f91a5d4310b16940405e7f0c2f20
DIFF: https://github.com/llvm/llvm-project/commit/7d69da71dd35f91a5d4310b16940405e7f0c2f20.diff

LOG: [OpenMP] Enable HeapToStack conversion in OpenMPOpt for new RTL globalization calls

Summary:
The changes to globalization introduced in D97680 introduce a large amount of overhead by default. The old globalization method would always ignore globalization code if executing in SPMD mode. This wasn't strictly correct as data sharing is still possible in SPMD mode. The new interface is correct but introduces globalization code even when unnecessary. This optimization will use the existing HeapToStack transformation in the attributor to allow for unneeded globalization to be replaced with thread-private stack memory. This is done using the newly introduced library instances for the RTL functions added in D102087.

Depends on D97818

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102197

Added: 
    llvm/test/Transforms/OpenMP/remove_globalization.ll

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
    llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index bf435c45293f5..1804cfeef7b8d 100644

--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -499,6 +499,11 @@ __OMP_ATTRS_SET(InaccessibleArgOnlyAttrs,
                                    EnumAttr(WillReturn), EnumAttr(NoFree))
                     : AttributeSet(EnumAttr(NoUnwind)))
 
+__OMP_ATTRS_SET(NoCaptureAttrs,
+                OptimisticAttributes
+                    ? AttributeSet(EnumAttr(NoCapture))
+                    : AttributeSet(EnumAttr(NoCapture)))
+
 #if 0
 __OMP_ATTRS_SET(InaccessibleOnlyAttrs,
                 OptimisticAttributes
@@ -840,6 +845,11 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(),
 __OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs))
 
+__OMP_RTL_ATTRS(__kmpc_alloc_shared, DefaultAttrs, ReturnPtrAttrs,
+                ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_free_shared, AllocAttrs, AttributeSet(),
+                ParamAttrs(NoCaptureAttrs))
+
 __OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())
 __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), ParamAttrs())
 

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index feee0fae92150..6270f711b005d 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1623,9 +1623,13 @@ struct OpenMPOpt {
     };
     GlobalizationRFI.foreachUse(SCC, CreateAA);
 
+    // Create an ExecutionDomain AA for every function and a HeapToStack AA for
+    // every function if there is a device kernel.
     for (auto *F : SCC) {
       if (!F->isDeclaration())
         A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
+      if (!OMPInfoCache.Kernels.empty())
+        A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
     }
   }
 };

diff  --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
index fa6bfcc6f5084..6c1b0c89597fe 100644
--- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@@ -29,9 +29,9 @@
 ; CHECK-DAG:   icmp eq i8* %5, @__omp_outlined__1_wrapper.ID
 ; CHECK-DAG:   icmp eq i8* %7, @__omp_outlined__3_wrapper.ID
 
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* @__omp_outlined__1_wrapper.ID, i8** %2, i64 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %2, i64 noundef 0)
 ; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* @__omp_outlined__3_wrapper.ID, i8** %3, i64 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef %3, i64 noundef 0)
 
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }

diff  --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
new file mode 100644
index 0000000000000..837851f062884
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+ at S = external local_unnamed_addr global i8*
+
+define void @kernel() {
+; CHECK-LABEL: define {{[^@]+}}@kernel() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @foo() #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    call void @bar() #[[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @foo()
+  call void @bar()
+  ret void
+}
+
+define internal void @foo() {
+; CHECK-LABEL: define {{[^@]+}}@foo
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i8* @__kmpc_alloc_shared(i64 4)
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
+}
+
+define internal void @bar() {
+; CHECK-LABEL: define {{[^@]+}}@bar
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]]
+; CHECK-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i8* @__kmpc_alloc_shared(i64 4)
+  call void @share(i8* %0)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
+}
+
+define internal void @use(i8* %x) {
+entry:
+  ret void
+}
+
+define internal void @share(i8* %x) {
+; CHECK-LABEL: define {{[^@]+}}@share
+; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8* [[X]], i8** @S, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i8* %x, i8** @S
+  ret void
+}
+
+; CHECK: declare i8* @__kmpc_alloc_shared(i64)
+declare i8* @__kmpc_alloc_shared(i64)
+
+; CHECK: declare void @__kmpc_free_shared(i8* nocapture)
+declare void @__kmpc_free_shared(i8*)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!nvvm.annotations = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{void ()* @kernel, !"kernel", i32 1}

diff  --git a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
index aea51fffd2f48..c401da803734c 100644
--- a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
+++ b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
@@ -3,6 +3,8 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 @.str = private unnamed_addr constant [13 x i8] c"Alloc Shared\00", align 1
 
+ at S = external local_unnamed_addr global i8*
+
 ; MODULE: remark: openmp_opt_module.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
 
 define void @foo() {
@@ -18,11 +20,11 @@ entry:
 define void @use(i8* %0) {
 entry:
   %.addr = alloca i8*, align 8
-  store i8* %0, i8** %.addr, align 8
+  store i8* %0, i8** @S
   ret void
 }
 
-define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
+define weak i8* @__kmpc_alloc_shared(i64 %DataSize) {
 entry:
   %call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
   ret i8* %call