<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href=https://github.com/llvm/llvm-project/issues/62627>62627</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            libomptarget interop use and destroy clauses with nowait and depend are missing synchronisations 

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          tomdeakin

      </td>

    </tr>

</table>

<pre>

    The OpenMP `interop` construct should ensure that foreign functions enqueued on the foreign synchronisation object (i.e., the CUDA queue) that is returned via the `omp_ipr_targetsync` property should finish before tasks enqueued after the `interop` construct which depend on that `interop` construct.

For example, OpenMP tasks depending on the `interop` construct with a `depend()` clause should not start until the foreign functions in the synchronisation object have finished.

The below example captures the incorrect behaviour, which can be fixed by adding a call to `cudaStreamSynchronize`. I'm suggesting a real fix could be to have the `interop` construct create a CUDA event and call `cuStreamWaitEvent` or `cudaStreamWaitEvent` for all other streams to make sure that those streams do not continue until the work on the "interop" stream is done.

## Build

```bash

clang++ -O3 kernel.cu --cuda-gpu-arch=sm_75 -c

clang -fopenmp --offload-arch=native example.c kernel.o -lcudart -L/path/to/cuda/lib64

```

## Output

```

# Where FIXME_MISSINGBARRIER is defined as empty

Incorrect 42894

# Where FIXME_MISSINGBARRIER is defined as cudaStreamSynchronize(s)

Success

```

## example.c

```c

#include <stdlib.h>

#include <stdio.h>

#include <omp.h>

#include <cuda_runtime.h>

#define FIXME_ADDBARRIER

//#define FIXME_ADDBARRIER cudaStreamSynchronize(s);

extern void call_cuda_kernel(int * A, int N, cudaStream_t s);

int main(void) {

  int N = 100000;

  int *A = (int *)malloc(sizeof(int) * N);

  #pragma omp target enter data map(alloc: A[:N])

  #pragma omp target nowait depend(out: A)

 for (int i = 0; i < N; ++i)

    A[i] = i;

  omp_interop_t iobj = omp_interop_none;

  #pragma omp interop init(targetsync: iobj) nowait depend(inout: A)

  // Check we have a CUDA runtime

  int err;

 if (omp_get_interop_int(iobj, omp_ipr_fr_id, &err) != omp_ifr_cuda) {

 printf("Wrong interop runtime\n");

    exit(EXIT_FAILURE);

  }

  // Get CUDA stream

  cudaStream_t s = (cudaStream_t) omp_get_interop_ptr(iobj, omp_ipr_targetsync, NULL);

  // Asynchronously enqueue CUDA kernel on the stream

  #pragma omp target data use_device_ptr(A)

  call_cuda_kernel(A, N, s);

 FIXME_ADDBARRIER

  #pragma omp interop use(iobj) nowait depend(inout: A)

  #pragma omp target nowait depend(inout: A)

  for (int i = 0; i < N; ++i)

    A[i] += 1;

  #pragma omp interop use(iobj) nowait depend(inout: A)

  #pragma omp target data use_device_ptr(A)

 call_cuda_kernel(A, N, s);

  FIXME_ADDBARRIER

  #pragma omp interop destroy(iobj) nowait depend(inout: A)

  #pragma omp taskwait

  #pragma omp target exit data map(from: A[:N])

  // Check solution

  for (int i = 0; i < N; ++i)

    if (A[i] != i + 3) {

      printf("Incorrect %d\n", A[i]);

 exit(EXIT_FAILURE);

    }

  printf("Success\n");

 free(A);

}

```

## kernel.cu

```c++

#include <cuda_runtime.h>

#include <cstdio>

__global__ void cuda_kernel(int *A, int N) {

  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

 if (tid < N)

    A[tid] += 1;

}

extern "C" {

void call_cuda_kernel(int *A, int N, cudaStream_t s) {

  cuda_kernel<<<N, 1, 0, s>>>(A, N);

}

}

```

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WF-P4jgS_zTmpQQCh78PPNDQrJBmek7TO5p9Q05SIZ527Jzt0N376U-2k5AwML17e7oWakKqXK5fVflXlTBj-Ekirsnsgcx2A1bZXOm1VUWK7IXLQazS9_XvOcKXEuXnfwGZj7m0qFVJ5mNIlDRWV4kFk6tKpIDSVBrB5sxCpjTyk4SskonlShpA-e8KK0xBSbA5thrmXSa5VpIb5hRBxT8wsUDoko9wROjWa2-_7TbgDRC6CltwAxptpSWmcObMq5H5WBXlkZf6aJk-oXXWnbOlViVq-974mnHJTQ4xOjfAMvPS8ZBlFnVj7ybi15wnOaRYoqwBMXtPeUTGOzLehP97pQHfWFEKdNDqwIb9gzkuT02I7u7ObQ7MicMSQpeErryOYJXBBqRUFoxl2kIlLRe9sF8Sw8Nmd_KQszPW0cK0B8UVRoxCvTaAIGGlrTQab4_LRGntTMSYszNXlXaIQ-QSJiF2dt8whfgdWOpxM0iYEGCVw5ZUKXu2Glnx3Lj2J5L5eAQHQhcFmOp0QmPDOo1MOHOQeOQxOiPe918FMtHILAIL5YVnlBaYTIMX3oXgwHfG7aOTuuVK973rCTOlwS1WNkcNxisY50vBXhAu58PmyuWplqfK5ypR0nJZYSddr0q_tOVAaYOC0nqtOwWpktjLDKERoRE8VFyk9Z35OHxiZvJwKxFMngh9IPQBhl8ieEEtUYySCoZDB254Kqsh00lOop0pjosZDJPOShhmqkRZlDAcqiwTiqWNtmSWn7GpilHSmFYwFM6ytjD8ROi-ZDYndG8VoXt3n9C94PF8euXyDWRfKltW9rYejeB7jhphf_jj8-Px8-H5-fD028Pm69fD41cfLsy44wxmAIvSvod1h7Zcp3S5mv4Xxm7XK10adza9uecqSdCYj_G1obtSTVotLhNRpQgk2hqbCh6PchI93hFzdVeqivKuzAE6aleLBXaVGtWAvQ7NZrerw9LI9_5zT-vX4Yoeunvhm0Ut4ax4OJpH71moKtcnpGsXG9g4fnE_ntzFxf7Rwg2jTrFgXBK6dIZdWyGLngYEY0CiHUzG7q81EUSEbjZeenGC0FXBhFCJQ8L_RJUFmTdPN861VccKoVGp2algoIoSQssCdIccUmYZFKwkdBkMRg7h7IFEmycy27VFBfBLY1K9Mm6hbRSqssFSu95RVg2AezQOpr_cwpO7DCTBOzuC94ST2c4v4F1E9bdvw4Gtjha4in941e5tqSTejUWtBFxyS-iy082jjTfnInoNjsuf4XXC4yoStjkmL_CKoTvU1F9XeTe5qPXFOZ65EDnnT2hbAD6vy-DMFprBI9NHV05bIHTujPjMT1rwmT4GsuvUG5SaS5v5Pk6_ayVPLf7Gs9lWEkr71QOAbz46j38cfj_uN4dP374-XhXYYldfXoXhN7QBeugjjbR_apri7t51jl8HorT6RiA6OaNbePr26dOVb32PNs0Eoioj3ptpLDgZznrTCPsu3yx7f3oqg8cUzzzB2sNuWdwgEk8gnjxu8MUdnrtz-pr0VQbbyPztev0Lx_mmhX94oumDZ7ybmfp_YPwwd389dZed7mTvDqQUjdXq_X8Fy7y4xf3ecpv637jtMn-mVXGP-G_ymlGicuP7PyqFwHadgvDsxZ0uRH3i8n9d9rrMUYTO0pa2tm199dLzIX91GKy51d2tmah-ZsemuWnEpnhaUd_irwaxdij-aRDzYfs7U1NfyU9lVyPV8XgSKmbieKxHnVtTTnfI6SXCP9V4ieWpT7XNNbL0kL6N3nzqYqGSl-bnJvzc8WL0dt3ngoFtGFZ6BGF5epMirkJaD2yE0q17VGm9_GCC-2CA66HtrI-24eNXTdy_cSCC6LH-XPjhVhW0F012B-k6SlfRig1wPZkvo_l0PJ9OBvk6y5bJMpsk49lkGWVxnFE2nU4m00lK2WIe4YCv6ZhG49l4NYlmCzoZTeN5tkiXdDqbzZJFuiLTMRaMi5EQ52Kk9GnAjalwPadzuhgIFqMw_n0MpRJfwQtdZc92A712a4ZxdTJkOhbcWHOxYrkVuBY8VkVZc0mHnv1Dbc1p9TsCE94i1LQW5P5tBtMIBTfGPVdfvRMwMKi0WOfWloZE9Yh_4jav4lGiCvf8Js7N17DU6gcmltC9R2EI3XuU_wkAAP__QaWKmw">