<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [NVPTX] Early call to __kmpc_global_thread_num"
href="https://bugs.llvm.org/show_bug.cgi?id=39171">39171</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[NVPTX] Early call to __kmpc_global_thread_num
</td>
</tr>
<tr>
<th>Product</th>
<td>OpenMP
</td>
</tr>
<tr>
<th>Version</th>
<td>unspecified
</td>
</tr>
<tr>
<th>Hardware</th>
<td>All
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Clang Compiler Support
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedclangbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>hahnjo@hahnjo.de
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>__kmpc_global_thread_num() in libomptarget-nvptx has to handle SPMD vs Generic
differently. The decision is based on the current value of "execution_param"
which is initialized by __kmpc_kernel_init() / __kmpc_spmd_kernel_init().
However current Clang trunk calls __kmpc_global_thread_num() from the entry
BasicBlock which is incorrect and might read from uninitialized memory.
Example for SPMD construct:
#pragma omp target parallel
{ }
This generates the following LLVM IR:
; Function Attrs: noinline norecurse nounwind optnone
define weak void @__omp_offloading_45_ba1b925f_main_l5() #0 {
entry:
%.zero.addr = alloca i32, align 4
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
%.threadid_temp. = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4
%nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !12
call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 1)
call void @__kmpc_data_sharing_init_stack_spmd()
br label %.execute
.execute: ; preds = %entry
store i32 %0, i32* %.threadid_temp., align 4
call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #11
br label %.omp.deinit
.omp.deinit: ; preds = %.execute
call void @__kmpc_spmd_kernel_deinit()
br label %.exit
.exit: ; preds = %.omp.deinit
ret void
}
Example for Generic construct (num_threads prohibits SPMD mode):
#pragma omp target parallel num_threads(2)
{ }
The worker and kernel functions look like this:
; Function Attrs: noinline norecurse nounwind
define internal void @__omp_offloading_45_ba294015_main_l5_worker() #0 {
entry:
%work_fn = alloca i8*, align 8
%exec_status = alloca i8, align 1
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
store i8* null, i8** %work_fn, align 8
store i8 0, i8* %exec_status, align 1
br label %.await.work
.await.work: ; preds = %.barrier.parallel,
%entry
call void @llvm.nvvm.barrier0()
%1 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1)
%2 = zext i1 %1 to i8
store i8 %2, i8* %exec_status, align 1
%3 = load i8*, i8** %work_fn, align 8
%should_terminate = icmp eq i8* %3, null
br i1 %should_terminate, label %.exit, label %.select.workers
.select.workers: ; preds = %.await.work
%4 = load i8, i8* %exec_status, align 1
%is_active = icmp ne i8 %4, 0
br i1 %is_active, label %.execute.parallel, label %.barrier.parallel
.execute.parallel: ; preds = %.select.workers
%5 = load i8*, i8** %work_fn, align 8
%work_match = icmp eq i8* %5, bitcast (void (i16, i32)*
@__omp_outlined___wrapper to i8*)
br i1 %work_match, label %.execute.fn, label %.check.next
.execute.fn: ; preds = %.execute.parallel
call void @__omp_outlined___wrapper(i16 0, i32 %0) #12
br label %.terminate.parallel
.check.next: ; preds = %.execute.parallel
%6 = bitcast i8* %3 to void (i16, i32)*
call void %6(i16 0, i32 %0)
br label %.terminate.parallel
.terminate.parallel: ; preds = %.check.next,
%.execute.fn
call void @__kmpc_kernel_end_parallel()
br label %.barrier.parallel
.barrier.parallel: ; preds =
%.terminate.parallel, %.select.workers
call void @llvm.nvvm.barrier0()
br label %.await.work
.exit: ; preds = %.await.work
ret void
}
; Function Attrs: noinline norecurse nounwind optnone
define weak void @__omp_offloading_45_ba294015_main_l5() #1 {
entry:
%0 = call i32 bitcast (i32 (i8*)* @__kmpc_global_thread_num to i32
(%struct.ident_t*)*)(%struct.ident_t* @0)
%.zero.addr = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4
%nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12
%nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size
%1 = icmp ult i32 %nvptx_tid, %thread_limit
br i1 %1, label %.worker, label %.mastercheck
.worker: ; preds = %entry
call void @__omp_offloading_45_ba294015_main_l5_worker() #12
br label %.exit
.mastercheck: ; preds = %entry
%nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12
%nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%2 = sub nuw i32 %nvptx_warp_size3, 1
%3 = sub nuw i32 %nvptx_num_threads2, 1
%4 = xor i32 %2, -1
%master_tid = and i32 %3, %4
%5 = icmp eq i32 %nvptx_tid1, %master_tid
br i1 %5, label %.master, label %.exit
.master: ; preds = %.mastercheck
%nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !13
%nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !14
%thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5
call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1)
call void @__kmpc_data_sharing_init_stack()
call void bitcast (void (i8*, i32, i32)* @__kmpc_push_num_threads to void
(%struct.ident_t*, i32, i32)*)(%struct.ident_t* @0, i32 %0, i32 2)
call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)*
@__omp_outlined___wrapper to i8*), i16 1)
call void @llvm.nvvm.barrier0()
call void @llvm.nvvm.barrier0()
br label %.termination.notifier
.termination.notifier: ; preds = %.master
call void @__kmpc_kernel_deinit(i16 1)
call void @llvm.nvvm.barrier0()
br label %.exit
.exit: ; preds =
%.termination.notifier, %.mastercheck, %.worker
ret void
}
In the worker function __kmpc_global_thread_num() may only be called after the
first barrier in .await.work because this is synchronizing with
__kmpc_kernel_init() called by the master thread in .master of the kernel
function.
Some comments say that the cached value of __kmpc_global_thread_num() isn't
used anyway by libomptarget-nvptx, so the solution may be as easy as not
calling the runtime at all. Otherwise Clang must make sure to only call the
function after the runtime is initialized, either through a preceeding call to
__kmpc_kernel_init() / __kmpc_spmd_kernel_init() or by synchronizing with a
barrier.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>