[llvm] 13b909e - OpenMPOpt: Check nested parallelism in target region

Jose M Monsalve Diaz via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 9 13:57:15 PST 2023


Author: Rafael A Herrera Guaitero
Date: 2023-01-09T15:55:30-06:00
New Revision: 13b909ef27e7cb7a4c55d1778254be1ac5c0884d

URL: https://github.com/llvm/llvm-project/commit/13b909ef27e7cb7a4c55d1778254be1ac5c0884d
DIFF: https://github.com/llvm/llvm-project/commit/13b909ef27e7cb7a4c55d1778254be1ac5c0884d.diff

LOG: OpenMPOpt: Check nested parallelism in target region

Analysis that determines if a parallel region can reach another parallel region in any target region of the TU.
A new global var is emitted with the name of the kernel + "_nested_parallelism", which is either 0 or 1 depending on the result.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D141010

Added: 
    llvm/test/Transforms/OpenMP/nested_parallelism.ll

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 25c03e7fe479c..5cf29bb1eafcb 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -598,6 +598,9 @@ struct KernelInfoState : AbstractState {
   /// caller is __kmpc_parallel_51.
   BooleanStateWithSetVector<uint8_t> ParallelLevels;
 
+  /// Flag that indicates if the kernel has nested Parallelism
+  bool NestedParallelism = false;
+
   /// Abstract State interface
   ///{
 
@@ -683,6 +686,7 @@ struct KernelInfoState : AbstractState {
     SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
     ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
     ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
+    NestedParallelism |= KIS.NestedParallelism;
     return *this;
   }
 
@@ -3339,6 +3343,15 @@ struct AAKernelInfoFunction : AAKernelInfo {
     if (!KernelInitCB || !KernelDeinitCB)
       return ChangeStatus::UNCHANGED;
 
+    /// Insert nested Parallelism global variable
+    Function *Kernel = getAnchorScope();
+    Module &M = *Kernel->getParent();
+    Type *Int8Ty = Type::getInt8Ty(M.getContext());
+    new GlobalVariable(M, Int8Ty, /* isConstant */ true,
+                       GlobalValue::WeakAnyLinkage,
+                       ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
+                       Kernel->getName() + "_nested_parallelism");
+
     // If we can we change the execution mode to SPMD-mode otherwise we build a
     // custom state machine.
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
@@ -4354,6 +4367,12 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       if (auto *ParallelRegion = dyn_cast<Function>(
               CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
         ReachedKnownParallelRegions.insert(ParallelRegion);
+        /// Check nested parallelism
+        auto &FnAA = A.getAAFor<AAKernelInfo>(
+            *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+        NestedParallelism |= !FnAA.getState().isValidState() ||
+                             !FnAA.ReachedKnownParallelRegions.empty() ||
+                             !FnAA.ReachedUnknownParallelRegions.empty();
         break;
       }
       // The condition above should usually get the parallel region function

diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
new file mode 100644
index 0000000000000..8c56e69fda246
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+
+; void foo1(int i) {
+;   #pragma omp parallel
+;     i++;
+; }
+
+; void foo(int i) {
+;   #pragma omp parallel
+;     foo1(i);
+; }
+
+; int main() {
+;   int i=0;
+;   #pragma omp target
+;     foo(i);
+
+;   #pragma omp target
+;     foo1(i);
+; }
+
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, ptr }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+@__omp_offloading_10302_bd7e0_main_l13_exec_mode = weak protected constant i8 3
+@__omp_offloading_10302_bd7e0_main_l16_exec_mode = weak protected constant i8 1
+@i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+@i.i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+@llvm.compiler.used = appending global [2 x ptr] [ptr @__omp_offloading_10302_bd7e0_main_l13_exec_mode, ptr @__omp_offloading_10302_bd7e0_main_l16_exec_mode], section "llvm.metadata"
+
+;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L13_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak protected constant i8 3
+; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L16_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak protected constant i8 3
+; CHECK: @[[I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16
+; CHECK: @[[I_I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16
+; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @__omp_offloading_10302_bd7e0_main_l13_exec_mode, ptr @__omp_offloading_10302_bd7e0_main_l16_exec_mode], section "llvm.metadata"
+; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L13_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L16_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+;.
+define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l13(i64 noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l13(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false)
+; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       user_code.entry:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[REGION_GUARDED_I:%.*]], label [[_Z3FOOI_INTERNALIZED_EXIT:%.*]]
+; CHECK:       region.guarded.i:
+; CHECK-NEXT:    [[I_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[I:%.*]] to i32
+; CHECK-NEXT:    store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), align 16
+; CHECK-NEXT:    br label [[_Z3FOOI_INTERNALIZED_EXIT]]
+; CHECK:       _Z3fooi.internalized.exit:
+; CHECK-NEXT:    tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR1]]
+; CHECK-NEXT:    store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2)
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+entry:
+  %captured_vars_addrs.i = alloca [1 x ptr], align 8
+  %0 = tail call i32 @__kmpc_target_init(ptr nonnull @1, i8 2, i1 false) #6
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %common.ret
+
+common.ret:                                       ; preds = %entry, %_Z3fooi.internalized.exit
+  ret void
+
+user_code.entry:                                  ; preds = %entry
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  %2 = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #6
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %region.guarded.i, label %_Z3fooi.internalized.exit
+
+region.guarded.i:                                 ; preds = %user_code.entry
+  %i.addr.sroa.0.0.extract.trunc = trunc i64 %i to i32
+  store i32 %i.addr.sroa.0.0.extract.trunc, ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), align 16
+  br label %_Z3fooi.internalized.exit
+
+_Z3fooi.internalized.exit:                        ; preds = %user_code.entry, %region.guarded.i
+  tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @1, i32 %2)
+  store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr %captured_vars_addrs.i, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull %captured_vars_addrs.i, i64 1) #6
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  call void @__kmpc_target_deinit(ptr nonnull @1, i8 2) #6
+  br label %common.ret
+}
+
+declare i32 @__kmpc_target_init(ptr, i8, i1) local_unnamed_addr
+
+define hidden void @_Z3fooi(i32 noundef %i1) local_unnamed_addr #1 {
+; CHECK-LABEL: @_Z3fooi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1]]
+; CHECK-NEXT:    [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4)
+; CHECK-NEXT:    store i32 [[I1:%.*]], ptr [[I]], align 16
+; CHECK-NEXT:    store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1)
+; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[I]], i64 4)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %captured_vars_addrs = alloca [1 x ptr], align 8
+  %0 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  %i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4)
+  store i32 %i1, ptr %i, align 16
+  store ptr %i, ptr %captured_vars_addrs, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %0, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull %captured_vars_addrs, i64 1) #6
+  call void @__kmpc_free_shared(ptr %i, i64 4)
+  ret void
+}
+
+declare void @__kmpc_target_deinit(ptr, i8) local_unnamed_addr
+
+define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l16(i64 noundef %i) local_unnamed_addr #2 {
+; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false)
+; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       user_code.entry:
+; CHECK-NEXT:    [[I_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[I:%.*]] to i32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1]]
+; CHECK-NEXT:    br label [[REGION_CHECK_TID:%.*]]
+; CHECK:       region.check.tid:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK:       region.guarded:
+; CHECK-NEXT:    store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), align 16
+; CHECK-NEXT:    br label [[REGION_GUARDED_END:%.*]]
+; CHECK:       region.guarded.end:
+; CHECK-NEXT:    br label [[REGION_BARRIER]]
+; CHECK:       region.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB1]], i32 [[TMP2]])
+; CHECK-NEXT:    br label [[REGION_EXIT:%.*]]
+; CHECK:       region.exit:
+; CHECK-NEXT:    store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2)
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+entry:
+  %captured_vars_addrs.i = alloca [1 x ptr], align 8
+  %0 = tail call i32 @__kmpc_target_init(ptr nonnull @1, i8 1, i1 true) #6
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %common.ret
+
+common.ret:                                       ; preds = %entry, %user_code.entry
+  ret void
+
+user_code.entry:                                  ; preds = %entry
+  %i.addr.sroa.0.0.extract.trunc = trunc i64 %i to i32
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  store i32 %i.addr.sroa.0.0.extract.trunc, ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), align 16
+  store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr %captured_vars_addrs.i, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i, i64 1) #6
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  call void @__kmpc_target_deinit(ptr nonnull @1, i8 1) #6
+  br label %common.ret
+}
+
+define hidden void @_Z4foo1i(i32 noundef %i1) local_unnamed_addr #1 {
+; CHECK-LABEL: @_Z4foo1i(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1]]
+; CHECK-NEXT:    [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4)
+; CHECK-NEXT:    store i32 [[I1:%.*]], ptr [[I]], align 16
+; CHECK-NEXT:    store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1)
+; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[I]], i64 4)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %captured_vars_addrs = alloca [1 x ptr], align 8
+  %0 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  %i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4)
+  store i32 %i1, ptr %i, align 16
+  store ptr %i, ptr %captured_vars_addrs, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %0, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs, i64 1) #6
+  call void @__kmpc_free_shared(ptr %i, i64 4)
+  ret void
+}
+
+declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #3
+
+define internal void @__omp_outlined__(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid., ptr nocapture noundef nonnull readonly align 4 dereferenceable(4) %i) #4 {
+; CHECK-LABEL: @__omp_outlined__(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1]]
+; CHECK-NEXT:    [[I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4)
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[I_I]], align 16
+; CHECK-NEXT:    store ptr [[I_I]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
+; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[I_I]], i64 4)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %captured_vars_addrs.i = alloca [1 x ptr], align 8
+  %0 = load i32, ptr %i, align 4
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  %i.i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6
+  store i32 %0, ptr %i.i, align 16
+  store ptr %i.i, ptr %captured_vars_addrs.i, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i, i64 1) #6
+  call void @__kmpc_free_shared(ptr %i.i, i64 4) #6
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %captured_vars_addrs.i)
+  ret void
+}
+
+define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #5 {
+; CHECK-LABEL: @__omp_outlined___wrapper(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_I_I:%.*]] = alloca [1 x ptr], align 8
+; CHECK-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR1]]
+; CHECK-NEXT:    [[I_I_I:%.*]] = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR1]]
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[I_I_I]], align 16
+; CHECK-NEXT:    store ptr [[I_I_I]], ptr [[CAPTURED_VARS_ADDRS_I_I]], align 8
+; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 1)
+; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[I_I_I]], i64 4) #[[ATTR1]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %captured_vars_addrs.i.i = alloca [1 x ptr], align 8
+  %global_args = alloca ptr, align 8
+  call void @__kmpc_get_shared_variables(ptr nonnull %global_args) #6
+  %2 = load ptr, ptr %global_args, align 8
+  %3 = load ptr, ptr %2, align 8
+  %4 = load i32, ptr %3, align 4
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %captured_vars_addrs.i.i)
+  %5 = call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6
+  %i.i.i = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6
+  store i32 %4, ptr %i.i.i, align 16
+  store ptr %i.i.i, ptr %captured_vars_addrs.i.i, align 8
+  call void @__kmpc_parallel_51(ptr nonnull @1, i32 %5, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i.i, i64 1) #6
+  call void @__kmpc_free_shared(ptr %i.i.i, i64 4) #6
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %captured_vars_addrs.i.i)
+  ret void
+}
+
+declare void @__kmpc_get_shared_variables(ptr) local_unnamed_addr
+
+declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #6
+
+declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) local_unnamed_addr #7
+
+declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #8
+
+define internal void @__omp_outlined__1(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid., ptr nocapture noundef nonnull align 4 dereferenceable(4) %i) #9 {
+; CHECK-LABEL: @__omp_outlined__1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+; CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, ptr %i, align 4
+  ret void
+}
+
+define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #5 {
+; CHECK-LABEL: @__omp_outlined__1_wrapper(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[INC_I:%.*]] = add nsw i32 [[TMP4]], 1
+; CHECK-NEXT:    store i32 [[INC_I]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %global_args = alloca ptr, align 8
+  call void @__kmpc_get_shared_variables(ptr nonnull %global_args) #6
+  %2 = load ptr, ptr %global_args, align 8
+  %3 = load ptr, ptr %2, align 8
+  %4 = load i32, ptr %3, align 4
+  %inc.i = add nsw i32 %4, 1
+  store i32 %inc.i, ptr %3, align 4
+  ret void
+}
+
+declare i32 @__kmpc_get_hardware_thread_id_in_block() local_unnamed_addr
+
+declare void @__kmpc_barrier_simple_spmd(ptr, i32) local_unnamed_addr #10
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #11
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11
+
+
+!omp_offload.info = !{!0, !1}
+!nvvm.annotations = !{!2, !3}
+!llvm.module.flags = !{!4, !5}
+
+!0 = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0}
+!1 = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1}
+!2 = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1}
+!3 = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1}
+
+!4 = !{i32 7, !"openmp", i32 50}
+!5 = !{i32 7, !"openmp-device", i32 50}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nosync nounwind allocsize(0) }
+; CHECK: attributes #[[ATTR1]] = { nounwind }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0}
+; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1}
+; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1}
+; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+;.


        


More information about the llvm-commits mailing list