[llvm] 6b9a3ec - [OpenMP] Do not SPMDize generic regions with no parallel
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 8 11:33:21 PDT 2021
Author: Joseph Huber
Date: 2021-09-08T14:33:15-04:00
New Revision: 6b9a3ec3a260053e4ae63e07372672b62de97eaf
URL: https://github.com/llvm/llvm-project/commit/6b9a3ec3a260053e4ae63e07372672b62de97eaf
DIFF: https://github.com/llvm/llvm-project/commit/6b9a3ec3a260053e4ae63e07372672b62de97eaf.diff
LOG: [OpenMP] Do not SPMDize generic regions with no parallel
This patch changes SPMDization to not trigger for regions with no
parallelism. Otherwise, this will introduce unnecessary barriers that
will slow the single-threaded region down.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D109438
Added:
Modified:
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/always_inline_device.ll
llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 5167494293a30..e4effa8990e25 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -599,6 +599,12 @@ struct KernelInfoState : AbstractState {
return true;
}
+ /// Returns true if this kernel contains any OpenMP parallel regions.
+ bool mayContainParallelRegion() {
+ return !ReachedKnownParallelRegions.empty() ||
+ !ReachedUnknownParallelRegions.empty();
+ }
+
/// Return empty set as the best state of potential values.
static KernelInfoState getBestState() { return KernelInfoState(true); }
@@ -3003,7 +3009,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine.
- if (!changeToSPMDMode(A))
+ if (!mayContainParallelRegion() || !changeToSPMDMode(A))
buildCustomStateMachine(A);
return ChangeStatus::CHANGED;
@@ -3308,8 +3314,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// happen if there simply are no parallel regions. In the resulting kernel
// all worker threads will simply exit right away, leaving the main thread
// to do the work alone.
- if (ReachedKnownParallelRegions.empty() &&
- ReachedUnknownParallelRegions.empty()) {
+ if (!mayContainParallelRegion()) {
++NumOpenMPTargetRegionKernelsWithoutStateMachine;
auto Remark = [&](OptimizationRemark OR) {
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 7ffac2215a156..2aa797ee28886 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -12,11 +12,11 @@ define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
; CHECK: Function Attrs: convergent norecurse nounwind
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
index 251885dc79c72..e4c341f8ada41 100644
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -54,18 +54,40 @@ define weak void @kernel1() #0 {
define weak void @kernel2() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel2
; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
+; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: ret void
+; CHECK: user_code.entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @helper0() #[[ATTR1]]
; CHECK-NEXT: call void @helper1() #[[ATTR1]]
; CHECK-NEXT: call void @helper2() #[[ATTR1]]
+; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
; CHECK-NEXT: ret void
;
- %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+entry:
+ %captured_vars_addrs = alloca [0 x i8*], align 8
+ %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
+ %exec_user_code = icmp eq i32 %i, -1
+ br i1 %exec_user_code, label %user_code.entry, label %common.ret
+
+common.ret:
+ ret void
+
+user_code.entry:
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null)
+ %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @helper0()
call void @helper1()
call void @helper2()
- call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+ call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
+ call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}
@@ -136,9 +158,31 @@ define internal void @helper2() {
ret void
}
+define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ ret void
+}
+
+define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ ret void
+}
+
declare i32 @__kmpc_get_hardware_num_threads_in_block()
declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
!llvm.module.flags = !{!0, !1}
@@ -155,7 +199,8 @@ attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"}
;.
; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" }
; CHECK: attributes #[[ATTR1]] = { nounwind }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
index a4bd36b3e98f6..3be3560f19cbc 100644
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -38,14 +38,36 @@ define weak void @is_spmd() {
define weak void @will_be_spmd() {
; CHECK-LABEL: define {{[^@]+}}@will_be_spmd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
+; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: ret void
+; CHECK: user_code.entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @is_spmd_helper2()
+; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
; CHECK-NEXT: ret void
;
- %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+entry:
+ %captured_vars_addrs = alloca [0 x i8*], align 8
+ %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
+ %exec_user_code = icmp eq i32 %i, -1
+ br i1 %exec_user_code, label %user_code.entry, label %common.ret
+
+common.ret:
+ ret void
+
+user_code.entry:
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null)
+ %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @is_spmd_helper2()
- call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+ call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
+ call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}
@@ -153,10 +175,32 @@ define internal void @is_mixed_helper() {
ret void
}
+define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ ret void
+}
+
+define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ ret void
+}
+
declare void @spmd_compatible() "llvm.assume"="ompx_spmd_amenable"
declare i8 @__kmpc_is_spmd_exec_mode()
-declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
-declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext)
+declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext)
+declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
declare void @foo()
declare void @bar()
@@ -171,6 +215,8 @@ declare void @bar()
!5 = !{void ()* @will_not_be_spmd, !"kernel", i32 1}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline }
+; CHECK: attributes #[[ATTR2]] = { nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
More information about the llvm-commits
mailing list