<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/62627>62627</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
libomptarget interop use and destroy clauses with nowait and depend are missing synchronisations
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
tomdeakin
</td>
</tr>
</table>
<pre>
The OpenMP `interop` construct should ensure that foreign functions enqueued on the foreign synchronisation object (i.e., the CUDA queue) that is returned via the `omp_ipr_targetsync` property should finish before tasks enqueued after the `interop` construct which depend on that `interop` construct.
For example, OpenMP tasks depending on the `interop` construct with a `depend()` clause should not start until the foreign functions in the synchronisation object have finished.
The below example captures the incorrect behaviour, which can be fixed by adding a call to `cudaStreamSynchronize`. I'm suggesting a real fix could be to have the `interop` construct create a CUDA event and call `cuStreamWaitEvent` or `cudaStreamWaitEvent` for all other streams to make sure that those streams do not continue until the work on the "interop" stream is done.
## Build
```bash
clang++ -O3 kernel.cu --cuda-gpu-arch=sm_75 -c
clang -fopenmp --offload-arch=native example.c kernel.o -lcudart -L/path/to/cuda/lib64
```
## Output
```
# Where FIXME_MISSINGBARRIER is defined as empty
Incorrect 42894
# Where FIXME_MISSINGBARRIER is defined as cudaStreamSynchronize(s)
Success
```
## example.c
```c
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <cuda_runtime.h>
#define FIXME_ADDBARRIER
//#define FIXME_ADDBARRIER cudaStreamSynchronize(s);
extern void call_cuda_kernel(int * A, int N, cudaStream_t s);
int main(void) {
int N = 100000;
int *A = (int *)malloc(sizeof(int) * N);
#pragma omp target enter data map(alloc: A[:N])
#pragma omp target nowait depend(out: A)
for (int i = 0; i < N; ++i)
A[i] = i;
omp_interop_t iobj = omp_interop_none;
#pragma omp interop init(targetsync: iobj) nowait depend(inout: A)
// Check we have a CUDA runtime
int err;
if (omp_get_interop_int(iobj, omp_ipr_fr_id, &err) != omp_ifr_cuda) {
printf("Wrong interop runtime\n");
exit(EXIT_FAILURE);
}
// Get CUDA stream
cudaStream_t s = (cudaStream_t) omp_get_interop_ptr(iobj, omp_ipr_targetsync, NULL);
// Asynchronously enqueue CUDA kernel on the stream
#pragma omp target data use_device_ptr(A)
call_cuda_kernel(A, N, s);
FIXME_ADDBARRIER
#pragma omp interop use(iobj) nowait depend(inout: A)
#pragma omp target nowait depend(inout: A)
for (int i = 0; i < N; ++i)
A[i] += 1;
#pragma omp interop use(iobj) nowait depend(inout: A)
#pragma omp target data use_device_ptr(A)
call_cuda_kernel(A, N, s);
FIXME_ADDBARRIER
#pragma omp interop destroy(iobj) nowait depend(inout: A)
#pragma omp taskwait
#pragma omp target exit data map(from: A[:N])
// Check solution
for (int i = 0; i < N; ++i)
if (A[i] != i + 3) {
printf("Incorrect %d\n", A[i]);
exit(EXIT_FAILURE);
}
printf("Success\n");
free(A);
}
```
## kernel.cu
```c++
#include <cuda_runtime.h>
#include <cstdio>
__global__ void cuda_kernel(int *A, int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N)
A[tid] += 1;
}
extern "C" {
void call_cuda_kernel(int *A, int N, cudaStream_t s) {
cuda_kernel<<<N, 1, 0, s>>>(A, N);
}
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WF-P4jgS_zTmpQQCh78PPNDQrJBmek7TO5p9Q05SIZ527Jzt0N376U-2k5AwML17e7oWakKqXK5fVflXlTBj-Ekirsnsgcx2A1bZXOm1VUWK7IXLQazS9_XvOcKXEuXnfwGZj7m0qFVJ5mNIlDRWV4kFk6tKpIDSVBrB5sxCpjTyk4SskonlShpA-e8KK0xBSbA5thrmXSa5VpIb5hRBxT8wsUDoko9wROjWa2-_7TbgDRC6CltwAxptpSWmcObMq5H5WBXlkZf6aJk-oXXWnbOlViVq-974mnHJTQ4xOjfAMvPS8ZBlFnVj7ybi15wnOaRYoqwBMXtPeUTGOzLehP97pQHfWFEKdNDqwIb9gzkuT02I7u7ObQ7MicMSQpeErryOYJXBBqRUFoxl2kIlLRe9sF8Sw8Nmd_KQszPW0cK0B8UVRoxCvTaAIGGlrTQab4_LRGntTMSYszNXlXaIQ-QSJiF2dt8whfgdWOpxM0iYEGCVw5ZUKXu2Glnx3Lj2J5L5eAQHQhcFmOp0QmPDOo1MOHOQeOQxOiPe918FMtHILAIL5YVnlBaYTIMX3oXgwHfG7aOTuuVK973rCTOlwS1WNkcNxisY50vBXhAu58PmyuWplqfK5ypR0nJZYSddr0q_tOVAaYOC0nqtOwWpktjLDKERoRE8VFyk9Z35OHxiZvJwKxFMngh9IPQBhl8ieEEtUYySCoZDB254Kqsh00lOop0pjosZDJPOShhmqkRZlDAcqiwTiqWNtmSWn7GpilHSmFYwFM6ytjD8ROi-ZDYndG8VoXt3n9C94PF8euXyDWRfKltW9rYejeB7jhphf_jj8-Px8-H5-fD028Pm69fD41cfLsy44wxmAIvSvod1h7Zcp3S5mv4Xxm7XK10adza9uecqSdCYj_G1obtSTVotLhNRpQgk2hqbCh6PchI93hFzdVeqivKuzAE6aleLBXaVGtWAvQ7NZrerw9LI9_5zT-vX4Yoeunvhm0Ut4ax4OJpH71moKtcnpGsXG9g4fnE_ntzFxf7Rwg2jTrFgXBK6dIZdWyGLngYEY0CiHUzG7q81EUSEbjZeenGC0FXBhFCJQ8L_RJUFmTdPN861VccKoVGp2algoIoSQssCdIccUmYZFKwkdBkMRg7h7IFEmycy27VFBfBLY1K9Mm6hbRSqssFSu95RVg2AezQOpr_cwpO7DCTBOzuC94ST2c4v4F1E9bdvw4Gtjha4in941e5tqSTejUWtBFxyS-iy082jjTfnInoNjsuf4XXC4yoStjkmL_CKoTvU1F9XeTe5qPXFOZ65EDnnT2hbAD6vy-DMFprBI9NHV05bIHTujPjMT1rwmT4GsuvUG5SaS5v5Pk6_ayVPLf7Gs9lWEkr71QOAbz46j38cfj_uN4dP374-XhXYYldfXoXhN7QBeugjjbR_apri7t51jl8HorT6RiA6OaNbePr26dOVb32PNs0Eoioj3ptpLDgZznrTCPsu3yx7f3oqg8cUzzzB2sNuWdwgEk8gnjxu8MUdnrtz-pr0VQbbyPztev0Lx_mmhX94oumDZ7ybmfp_YPwwd389dZed7mTvDqQUjdXq_X8Fy7y4xf3ecpv637jtMn-mVXGP-G_ymlGicuP7PyqFwHadgvDsxZ0uRH3i8n9d9rrMUYTO0pa2tm199dLzIX91GKy51d2tmah-ZsemuWnEpnhaUd_irwaxdij-aRDzYfs7U1NfyU9lVyPV8XgSKmbieKxHnVtTTnfI6SXCP9V4ieWpT7XNNbL0kL6N3nzqYqGSl-bnJvzc8WL0dt3ngoFtGFZ6BGF5epMirkJaD2yE0q17VGm9_GCC-2CA66HtrI-24eNXTdy_cSCC6LH-XPjhVhW0F012B-k6SlfRig1wPZkvo_l0PJ9OBvk6y5bJMpsk49lkGWVxnFE2nU4m00lK2WIe4YCv6ZhG49l4NYlmCzoZTeN5tkiXdDqbzZJFuiLTMRaMi5EQ52Kk9GnAjalwPadzuhgIFqMw_n0MpRJfwQtdZc92A712a4ZxdTJkOhbcWHOxYrkVuBY8VkVZc0mHnv1Dbc1p9TsCE94i1LQW5P5tBtMIBTfGPVdfvRMwMKi0WOfWloZE9Yh_4jav4lGiCvf8Js7N17DU6gcmltC9R2EI3XuU_wkAAP__QaWKmw">