[llvm] be2b569 - [OpenMP] Run rewriteDeviceCodeStateMachine in the Module not CGSCC pass

Mon Jul 26 19:26:36 PDT 2021

Author: Johannes Doerfert
Date: 2021-07-26T21:26:07-05:00
New Revision: be2b569646984e4aac988afda3090c2225228752

URL: https://github.com/llvm/llvm-project/commit/be2b569646984e4aac988afda3090c2225228752
DIFF: https://github.com/llvm/llvm-project/commit/be2b569646984e4aac988afda3090c2225228752.diff

LOG: [OpenMP] Run rewriteDeviceCodeStateMachine in the Module not CGSCC pass

While rewriteDeviceCodeStateMachine should probably be folded into
buildCustomStateMachine, we at least need the optimization to happen.
This was not reliably the case in the CGSCC pass but in the Module pass
it seems to work reliably.

This also ports a test to the new kernel encoding (target_init/deinit),
and makes sure we cannot run the kernel in SPMD mode.

Differential Revision: https://reviews.llvm.org/D106345

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/custom_state_machines.ll
    llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 7eb3117c4a549..5d4f4f47fb9a5 100644

--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -719,6 +719,9 @@ struct OpenMPOpt {
       // Recollect uses, in case Attributor deleted any.
       OMPInfoCache.recollectUses();
 
+      // TODO: This should be folded into buildCustomStateMachine.
+      Changed |= rewriteDeviceCodeStateMachine();
+
       if (remarksEnabled())
         analysisGlobalization();
     } else {
@@ -733,7 +736,6 @@ struct OpenMPOpt {
       OMPInfoCache.recollectUses();
 
       Changed |= deleteParallelRegions();
-      Changed |= rewriteDeviceCodeStateMachine();
 
       if (HideMemoryTransferLatency)
         Changed |= hideMemTransfersLatency();

diff  --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 012e25ccf0b06..f25778ec2e5ea 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -1003,7 +1003,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__2_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]])
@@ -1046,10 +1046,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1138,7 +1138,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check1:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__5_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute2:
 ; CHECK-NEXT:    call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
@@ -1182,7 +1182,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
@@ -1282,13 +1282,13 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__7_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check1:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__8_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute2:
 ; CHECK-NEXT:    call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]])
@@ -1327,10 +1327,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1413,7 +1413,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]])
@@ -1456,10 +1456,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR9:[0-9]+]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;

diff  --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
index 2ef8d760bf3a3..1c9e67cf39f62 100644
--- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@@ -1,6 +1,5 @@
-; RUN: opt -S -passes=openmp-opt-cgscc -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
-; RUN: opt -S -passes=openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
-; RUN: opt -S        -openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
+; RUN: opt -S -passes=openmp-opt -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
+; RUN: opt -S -passes=openmp-opt -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
 
 ; C input used for this test:
 
@@ -14,6 +13,7 @@
 ;     #pragma omp parallel
 ;     {}
 ;     bar();
+;     unknown();
 ;     #pragma omp parallel
 ;     {}
 ;   }
@@ -24,146 +24,49 @@
 ; another kernel.
 
 ; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef
-; CHECK-DAG: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; CHECK-DAG: @__omp_outlined__2_wrapper.ID = private constant i8 undef
 
-; CHECK-DAG:   icmp eq i8* %5, @__omp_outlined__1_wrapper.ID
-; CHECK-DAG:   icmp eq i8* %7, @__omp_outlined__3_wrapper.ID
+; CHECK-DAG:   icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*)
+; CHECK-DAG:   icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)
 
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %2, i64 noundef 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef %3, i64 noundef 0)
+
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @2, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef %{{.*}}, i64 noundef 0)
 
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+ at __omp_offloading_10301_87b2c_foo_l7_exec_mode = weak constant i8 1
+ at 2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+ at llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_10301_87b2c_foo_l7_exec_mode], section "llvm.metadata"
 
-define internal void @__omp_offloading_50_6dfa0f01_foo_l6_worker()  {
-entry:
-  %work_fn = alloca i8*, align 8
-  %exec_status = alloca i8, align 1
-  store i8* null, i8** %work_fn, align 8
-  store i8 0, i8* %exec_status, align 1
-  br label %.await.work
-
-.await.work:                                      ; preds = %.barrier.parallel, %entry
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn)
-  %1 = zext i1 %0 to i8
-  store i8 %1, i8* %exec_status, align 1
-  %2 = load i8*, i8** %work_fn, align 8
-  %should_terminate = icmp eq i8* %2, null
-  br i1 %should_terminate, label %.exit, label %.select.workers
-
-.select.workers:                                  ; preds = %.await.work
-  %3 = load i8, i8* %exec_status, align 1
-  %is_active = icmp ne i8 %3, 0
-  br i1 %is_active, label %.execute.parallel, label %.barrier.parallel
-
-.execute.parallel:                                ; preds = %.select.workers
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-  %5 = load i8*, i8** %work_fn, align 8
-  %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
-  br i1 %work_match, label %.execute.fn, label %.check.next
-
-.execute.fn:                                      ; preds = %.execute.parallel
-  call void @__omp_outlined__1_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next:                                      ; preds = %.execute.parallel
-  %6 = load i8*, i8** %work_fn, align 8
-  %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
-  br i1 %work_match1, label %.execute.fn2, label %.check.next3
-
-.execute.fn2:                                     ; preds = %.check.next
-  call void @__omp_outlined__2_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next3:                                     ; preds = %.check.next
-  %7 = load i8*, i8** %work_fn, align 8
-  %work_match4 = icmp eq i8* %7, bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
-  br i1 %work_match4, label %.execute.fn5, label %.check.next6
-
-.execute.fn5:                                     ; preds = %.check.next3
-  call void @__omp_outlined__3_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next6:                                     ; preds = %.check.next3
-  %8 = bitcast i8* %2 to void (i16, i32)*
-  call void %8(i16 0, i32 %4)
-  br label %.terminate.parallel
-
-.terminate.parallel:                              ; preds = %.check.next6, %.execute.fn5, %.execute.fn2, %.execute.fn
-  call void @__kmpc_kernel_end_parallel()
-  br label %.barrier.parallel
-
-.barrier.parallel:                                ; preds = %.terminate.parallel, %.select.workers
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  br label %.await.work
-
-.exit:                                            ; preds = %.await.work
-  ret void
-}
-
-define weak void @__omp_offloading_50_6dfa0f01_foo_l6()  {
+define weak void @__omp_offloading_10301_87b2c_foo_l7() {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
   store i32 0, i32* %.zero.addr, align 4
-  %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size
-  %0 = icmp ult i32 %nvptx_tid, %thread_limit
-  br i1 %0, label %.worker, label %.mastercheck
-
-.worker:                                          ; preds = %entry
-  call void @__omp_offloading_50_6dfa0f01_foo_l6_worker() 
-  br label %.exit
-
-.mastercheck:                                     ; preds = %entry
-  %nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %1 = sub nuw i32 %nvptx_warp_size3, 1
-  %2 = sub nuw i32 %nvptx_num_threads2, 1
-  %3 = xor i32 %1, -1
-  %master_tid = and i32 %2, %3
-  %4 = icmp eq i32 %nvptx_tid1, %master_tid
-  br i1 %4, label %.master, label %.exit
-
-.master:                                          ; preds = %.mastercheck
-  %nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5
-  call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1)
-  call void @__kmpc_data_sharing_init_stack()
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-  store i32 %5, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) 
-  br label %.termination.notifier
-
-.termination.notifier:                            ; preds = %.master
-  call void @__kmpc_kernel_deinit(i16 1)
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  br label %.exit
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit
+
+user_code.entry:                                  ; preds = %entry
+  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %1, i32* %.threadid_temp., align 4
+  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr)
+  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+  ret void
 
-.exit:                                            ; preds = %.termination.notifier, %.mastercheck, %.worker
+worker.exit:                                      ; preds = %entry
   ret void
 }
 
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
-
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 
-
-declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() 
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
+declare void @unknown()
 
-declare void @__kmpc_kernel_init(i32, i16)
-
-declare void @__kmpc_data_sharing_init_stack()
-
-define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
@@ -175,13 +78,14 @@ entry:
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %2, i64 0)
-  call void @bar() 
+  call void @bar()
+  call void @unknown()
   %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
-  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0)
+  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %3, i64 0)
   ret void
 }
 
-define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
@@ -190,7 +94,7 @@ entry:
   ret void
 }
 
-define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -200,7 +104,7 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr)
   ret void
 }
 
@@ -208,16 +112,16 @@ declare void @__kmpc_get_shared_variables(i8***)
 
 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
 
-define hidden void @bar()  {
+define hidden void @bar() {
 entry:
   %captured_vars_addrs = alloca [0 x i8*], align 8
-  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
   %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
-  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
+  call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %1, i64 0)
   ret void
 }
 
-define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
@@ -226,7 +130,7 @@ entry:
   ret void
 }
 
-define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -236,13 +140,15 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr)
   ret void
 }
 
-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) 
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
 
-define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
+
+define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
@@ -251,7 +157,7 @@ entry:
   ret void
 }
 
-define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -261,22 +167,15 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr)
   ret void
 }
 
-declare void @__kmpc_kernel_deinit(i16)
-
-declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) 
-
-declare i1 @__kmpc_kernel_parallel(i8**)
-
-declare void @__kmpc_kernel_end_parallel()
-
-
+!omp_offload.info = !{!0}
 !nvvm.annotations = !{!1}
 !llvm.module.flags = !{!2, !3}
 
-!1 = !{void ()* @__omp_offloading_50_6dfa0f01_foo_l6, !"kernel", i32 1}
+!0 = !{i32 0, i32 66305, i32 555956, !"foo", i32 7, i32 0}
+!1 = !{void ()* @__omp_offloading_10301_87b2c_foo_l7, !"kernel", i32 1}
 !2 = !{i32 7, !"openmp", i32 50}
 !3 = !{i32 7, !"openmp-device", i32 50}