[llvm-bugs] [Bug 39171] New: [NVPTX] Early call to __kmpc_global_thread_num
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Oct 4 01:35:43 PDT 2018
https://bugs.llvm.org/show_bug.cgi?id=39171
Bug ID: 39171
Summary: [NVPTX] Early call to __kmpc_global_thread_num
Product: OpenMP
Version: unspecified
Hardware: All
OS: All
Status: NEW
Severity: normal
Priority: P
Component: Clang Compiler Support
Assignee: unassignedclangbugs at nondot.org
Reporter: hahnjo at hahnjo.de
CC: llvm-bugs at lists.llvm.org
__kmpc_global_thread_num() in libomptarget-nvptx has to handle SPMD vs Generic
differently. The decision is based on the current value of "execution_param"
which is initialized by __kmpc_kernel_init() / __kmpc_spmd_kernel_init().
However current Clang trunk calls __kmpc_global_thread_num() from the entry
BasicBlock which is incorrect and might read from uninitialized memory.
Example for SPMD construct:
#pragma omp target parallel
{ }
This generates the following LLVM IR:
; Function Attrs: noinline norecurse nounwind optnone
define weak void @__omp_offloading_45_ba1b925f_main_l5() #0 {
entry:
%.zero.addr = alloca i32, align 4
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
%.threadid_temp. = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4
%nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !12
call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 1)
call void @__kmpc_data_sharing_init_stack_spmd()
br label %.execute
.execute: ; preds = %entry
store i32 %0, i32* %.threadid_temp., align 4
call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #11
br label %.omp.deinit
.omp.deinit: ; preds = %.execute
call void @__kmpc_spmd_kernel_deinit()
br label %.exit
.exit: ; preds = %.omp.deinit
ret void
}
Example for Generic construct (num_threads prohibits SPMD mode):
#pragma omp target parallel num_threads(2)
{ }
The worker and kernel functions look like this:
; Function Attrs: noinline norecurse nounwind
define internal void @__omp_offloading_45_ba294015_main_l5_worker() #0 {
entry:
%work_fn = alloca i8*, align 8
%exec_status = alloca i8, align 1
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
store i8* null, i8** %work_fn, align 8
store i8 0, i8* %exec_status, align 1
br label %.await.work
.await.work: ; preds = %.barrier.parallel,
%entry
call void @llvm.nvvm.barrier0()
%1 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1)
%2 = zext i1 %1 to i8
store i8 %2, i8* %exec_status, align 1
%3 = load i8*, i8** %work_fn, align 8
%should_terminate = icmp eq i8* %3, null
br i1 %should_terminate, label %.exit, label %.select.workers
.select.workers: ; preds = %.await.work
%4 = load i8, i8* %exec_status, align 1
%is_active = icmp ne i8 %4, 0
br i1 %is_active, label %.execute.parallel, label %.barrier.parallel
.execute.parallel: ; preds = %.select.workers
%5 = load i8*, i8** %work_fn, align 8
%work_match = icmp eq i8* %5, bitcast (void (i16, i32)*
@__omp_outlined___wrapper to i8*)
br i1 %work_match, label %.execute.fn, label %.check.next
.execute.fn: ; preds = %.execute.parallel
call void @__omp_outlined___wrapper(i16 0, i32 %0) #12
br label %.terminate.parallel
.check.next: ; preds = %.execute.parallel
%6 = bitcast i8* %3 to void (i16, i32)*
call void %6(i16 0, i32 %0)
br label %.terminate.parallel
.terminate.parallel: ; preds = %.check.next,
%.execute.fn
call void @__kmpc_kernel_end_parallel()
br label %.barrier.parallel
.barrier.parallel: ; preds =
%.terminate.parallel, %.select.workers
call void @llvm.nvvm.barrier0()
br label %.await.work
.exit: ; preds = %.await.work
ret void
}
; Function Attrs: noinline norecurse nounwind optnone
define weak void @__omp_offloading_45_ba294015_main_l5() #1 {
entry:
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
%.zero.addr = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4
%nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12
%nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size
%1 = icmp ult i32 %nvptx_tid, %thread_limit
br i1 %1, label %.worker, label %.mastercheck
.worker: ; preds = %entry
call void @__omp_offloading_45_ba294015_main_l5_worker() #12
br label %.exit
.mastercheck: ; preds = %entry
%nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12
%nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%2 = sub nuw i32 %nvptx_warp_size3, 1
%3 = sub nuw i32 %nvptx_num_threads2, 1
%4 = xor i32 %2, -1
%master_tid = and i32 %3, %4
%5 = icmp eq i32 %nvptx_tid1, %master_tid
br i1 %5, label %.master, label %.exit
.master: ; preds = %.mastercheck
%nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5
call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1)
call void @__kmpc_data_sharing_init_stack()
call void bitcast (void (i8*, i32, i32)* @__kmpc_push_num_threads to void
(%struct.ident_t*, i32, i32)*)(%struct.ident_t* @0, i32 %0, i32 2)
call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)*
@__omp_outlined___wrapper to i8*), i16 1)
call void @llvm.nvvm.barrier0()
call void @llvm.nvvm.barrier0()
br label %.termination.notifier
.termination.notifier: ; preds = %.master
call void @__kmpc_kernel_deinit(i16 1)
call void @llvm.nvvm.barrier0()
br label %.exit
.exit: ; preds =
%.termination.notifier, %.mastercheck, %.worker
ret void
}
In the worker function __kmpc_global_thread_num() may only be called after the
first barrier in .await.work because this is synchronizing with
__kmpc_kernel_init() called by the master thread in .master of the kernel
function.
Some comments say that the cached value of __kmpc_global_thread_num() isn't
used anyway by libomptarget-nvptx, so the solution may be as easy as not
calling the runtime at all. Otherwise Clang must make sure to only call the
function after the runtime is initialized, either through a preceeding call to
__kmpc_kernel_init() / __kmpc_spmd_kernel_init() or by synchronizing with a
barrier.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20181004/619b48ee/attachment-0001.html>
More information about the llvm-bugs
mailing list