<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/65103">65103</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [OMPT] Incorrect `work_type` for loop in `teams distribute parallel for` directive
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Thyre
      </td>
    </tr>
</table>

<pre>
    ## Description

When working with OpenMP teams, users can use the directive `#pragma omp distribute parallel for` to distribute work not only across threads but also across teams. On the OMPT side, we would expect to see two `ompt_callback_work` callbacks, each dispatched with `ompt_scope_begin` and `ompt_scope_end`.

The first callback should use `work_type = ompt_work_distribute` and is dispatched by the initial thread of every team. For the parallel for, each thread should dispatch `work_type = ompt_work_loop`.

However, while we get the distribute work type correctly for the teams, threads do not use `ompt_work_loop` for `ompt_scope_end` and instead dispatch the event with `ompt_work_distribute`.

## Reproducer

The issue can be reproduced with the following code:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <omp.h>
#include <omp-tools.h>

__thread bool distribute_begin_called;
__thread bool distribute_end_called;
__thread bool loop_begin_called;
__thread bool loop_end_called;

/* OMPT work callback: records, per thread, which work-type begin/end
 * events were dispatched.  Only ompt_work_distribute and ompt_work_loop
 * are tracked; any endpoint other than ompt_scope_begin is treated as
 * the matching end event, exactly as in the original logic. */
void work_cb(ompt_work_t work_type,
             ompt_scope_endpoint_t endpoint,
             ompt_data_t *parallel_data,
             ompt_data_t *task_data,
             uint64_t count,
             const void *codeptr_ra)
{
    const bool is_begin = (endpoint == ompt_scope_begin);

    switch (work_type) {
    case ompt_work_distribute:
        if (is_begin) {
            distribute_begin_called = true;
        } else {
            distribute_end_called = true;
        }
        break;
    case ompt_work_loop:
        if (is_begin) {
            loop_begin_called = true;
        } else {
            loop_end_called = true;
        }
        break;
    default:
        /* Other work types are intentionally ignored. */
        break;
    }
}

/* OMPT tool initializer: registers the work callback with the runtime.
 * Returns non-zero on success, per the OMPT specification.
 *
 * Fixes over the original:
 *  - the OMPT_CALLBACK macro now honors its EXPECTED argument instead of
 *    hardcoding ompt_set_always;
 *  - the macro uses the do { } while (0) idiom so it behaves like a
 *    single statement;
 *  - the lookup result is checked before use;
 *  - `result` is explicitly consumed so NDEBUG builds don't warn. */
static int
my_initialize_tool(ompt_function_lookup_t lookup,
                   int initial_device_num,
                   ompt_data_t *tool_data)
{
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");
    assert(set_callback != NULL);

/* Register callback NAME (handler NAME##_cb) and assert the runtime
 * reports the EXPECTED registration result.  SIGNATURE is unused but
 * kept for interface compatibility. */
#define OMPT_CALLBACK(NAME, SIGNATURE, EXPECTED) \
    do { \
        ompt_set_result_t result = set_callback(ompt_callback_##NAME, (ompt_callback_t)&NAME##_cb); \
        assert(result == (EXPECTED)); \
        (void)result; \
    } while (0)

    OMPT_CALLBACK(work, work, ompt_set_always);

#undef OMPT_CALLBACK
    return 1; /* non-zero indicates success */
}

/* OMPT tool finalizer: this tool keeps no state, so there is nothing
 * to tear down. */
static void
my_finalize_tool(ompt_data_t *tool_data)
{
    (void)tool_data; /* unused */
}

ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version,
                const char *runtime_version)
{
    setbuf(stdout, NULL);
    printf("[%s] omp_version %d | runtime_version = \'%s\'\n",
 __FUNCTION__,
           omp_version,
           runtime_version);
 static ompt_start_tool_result_t tool = {&my_initialize_tool,
 &my_finalize_tool,
 ompt_data_none};
    return &tool;
}


void 
report_summary_distribute()
{
 #pragma omp parallel 
    {
        printf("Thread ID = %2d | distribute begin = %5s | distribute end = %5s | %s\n", 
 omp_get_thread_num(), 
        distribute_begin_called ? "true" : "false", 
        distribute_end_called ? "true" : "false", 
 distribute_begin_called == distribute_end_called ? "\033[0;32m pass \033[0m" : "\033[0;31m fail \033[0m");
    }
}


void 
report_summary_parallel()
{
    #pragma omp parallel
    {
 printf("Thread ID = %2d | loop begin = %5s | loop end = %5s | %s\n", 
        omp_get_thread_num(), 
        loop_begin_called ? "true" : "false", 
        loop_end_called ? "true" : "false", 
 loop_begin_called == loop_end_called ? "\033[0;32m pass \033[0m" : "\033[0;31m fail \033[0m");
    }
}

/* Reproducer driver: exercises `distribute` alone (works correctly),
 * then `distribute parallel for` (triggers the wrong work_type on
 * ompt_scope_end).  The OMPT work callback records events into the
 * thread-local flags that the report_summary_* helpers print. */
int main( void )
{
    int a[100];
    
    /* Baseline: plain `teams distribute` dispatches ompt_work_distribute
     * correctly on begin and end. */
    #pragma omp target teams distribute 
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
 }
    report_summary_distribute();
 report_summary_parallel();

 printf("-------------------------------\n");

    // Reset
 #pragma omp parallel
    {
        distribute_begin_called = false;
        distribute_end_called = false;
        loop_begin_called = false;
        loop_end_called = false;
    }

    /* Buggy case: with `distribute parallel for`, threads dispatch
     * ompt_work_distribute instead of ompt_work_loop on scope end. */
    #pragma omp target teams distribute parallel for
    for(int i = 0; i < 100; ++i)
 {
        a[i] = i;
    }
    /* Host variant with explicit teams shows the same behavior. */
    #pragma omp teams num_teams(2)
 #pragma omp distribute parallel for 
    for(int i = 0; i < 100; ++i)
 {
        a[i] = i;
    }
    report_summary_distribute();
 report_summary_parallel();
 
    return 0;
}
```

Running the code, we can see the following output:
```console
$ clang --version
clang version 18.0.0 (https://github.com/llvm/llvm-project.git 4b383107fa7585bb5ecd7f03cab7800b33d1585a)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/software/software/LLVM/git/bin
$ clang -fopenmp -fopenmp-targets=x86_64 reproducer.c
$ OMP_NUM_THREADS=4 ./a.out 
[ompt_start_tool] omp_version 201611 | runtime_version = 'LLVM OMP version: 5.0.20140926'
Thread ID =  0 | distribute begin =  true | distribute end =  true |  pass 
Thread ID =  3 | distribute begin = false | distribute end = false |  pass 
Thread ID =  2 | distribute begin = false | distribute end = false |  pass 
Thread ID =  1 | distribute begin = false | distribute end = false |  pass 
Thread ID =  0 | loop begin = false | loop end = false |  pass 
Thread ID =  1 | loop begin = false | loop end = false |  pass 
Thread ID =  3 | loop begin = false | loop end = false |  pass 
Thread ID =  2 | loop begin = false | loop end = false | pass 
-------------------------------
Thread ID =  0 | distribute begin =  true | distribute end =  true |  pass 
Thread ID =  3 | distribute begin = false | distribute end =  true |  fail 
Thread ID =  2 | distribute begin = false | distribute end =  true |  fail 
Thread ID =  1 | distribute begin =  true | distribute end =  true |  pass 
Thread ID = 0 | loop begin =  true | loop end = false |  fail 
Thread ID =  3 | loop begin =  true | loop end = false |  fail 
Thread ID =  2 | loop begin = true | loop end = false |  fail 
Thread ID =  1 | loop begin =  true | loop end = false |  fail 
```

In the second part of the output, we would expect to see `distribute end = false` for all threads but `Thread ID = 0`. In addition, all threads should have `loop end = true`. The first part is included to show that the issue only affects directives using `parallel for`; using `distribute` alone doesn't produce the issue. If we nest `#pragma omp parallel for` inside of a distribute directive, it also works fine. However, we need to use two loops in that case. 
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzUWktv4zgS_jXMpRBDpiLZOeSQ2O2dYPNo9KR39yZQYsnmtkwKIhV35tcvSL1t2XEmPViM0XDLYrGeX7GKZJjWYi0Rb0hwR4LlBSvNRhU3L5u3Ai9ixd9uCPUJ9WGJOilEboSSxFsS77b6_vcGJexU8UPINeyE2cBzjvLxKxhkW03oAkqNhQZWILA4QzDKvgGzQeCiwMSIVwQSeoT6ecHWWwZqmwMX2hQiLg1CzgqWZZhBqgoSepZBb9SKBqkMKJm9AUsKpTWYTYGMa4hLU-kBTMMOs2wCz9KJfn78-gJacLQa7iybMuOAP3NMjJWgEcHslFVMbXMTJSzLYpb8iKw8q4UztRnVicoxinEtpB1jku8NoeR2AFmymUDffS8bhFQU2kAjAfTG6WKdRELPyovMW45A_CU4lu5V54JGotDWLzkzyQY5xG_OTiGFESyrPQIqBXzF4s15ZQIrVTiqgYvpwunZTKnVaVif0ilTKiehN-kb-JvaWYnOzRuRoXX2Gk0d_2EYHctEFRYV2ZtVxpG1SGrCypWLeO2hA_lu4qj_nZukNtau1iArAl9RmmFMD708sKvOim-YF4qXCRb7URValwgJkxAjFA0Zr4RYmanKMrWzaZMojsS_HbAPvepf0ooTMslKbn2-YFpjYSYb4n8ZG9aGx0plp8aFOjqqtvmpsUujVKb7FO47imq8WMm9yFZZ4fIHOfHv3qFGyU_T2hifw9PRjXCrvl-V4A5zURITOu8CbqAFN6GLihj6nyGociWkiQw0jyfmcGZYZIDQ2ybb3Ktuxh6VYfrHHkX_UwppwqvIQKLKY3ITJbUBZyuhtxZluSmighF6XTtjdtdNEymhcxjk9rElB1oOMGDRsWkcMuDSWyZ7HOpPy4h418fw49YbU5TYhrObvQTM7ILQ6jOKqpMcuheDH-86xmJtaNDnvXLAwX4OwP8hh-ylxEdd0T24b22YEQlY0Lvf27eoLjbiD4zsItEkVlrKxLYN1k8_yjwyUD10oLUuqSdHHF9FgpEst-OgHk0pK65OljFoV37GXhk30P_p4lH7vVZ6SE3odaPznFB6QEEotZKHiwyhPsdUyKrbiBa3Dw93t4t_Ejp_un38Ygva7_f_eLp9-f7N_fjyn69fFi9floReAwkWAzANXwxMKlCXmV2DqgcX1aFq82EHU1WuRoWDYWsroWE1bindEmltO1SiqkOEzjvZHazRRCzbsTc9NnsPTPbVvpNcp2Xbhvr_Ua57_i4lx3SPUyugQFMWEqZOGboi9Bakkpd_YKFASC4SZlCDLpMEtbaoskQnoG8X1hb7qZCHyD8boKNiKoMNK4xj2kXaqjZGQei8lK6d5y6l1DaPXrHQtmU_lktVjUg2rLBsi1IascVu1piyGk1c2vVMG65KW3vg6fvDQz8gliwvhDRplTB2c0EDTYJlXykgNOBAZgvYk-tAbPFCZ9U09xQspMuzxpIoWn1_WrzcPz9F0Zh975g_Ymurfh3hoxGwvyolZ3eEhmNr36JdUMJDeBzWfKkk2vj3PVgDltDQzWrRPoRJr6OpHgvMVWEiXW63rHjr97A2FgcRHW682q3AiWLWj-xL1XPdLyt30IBWEe01C3Vpq4YDvT-Mku8N1jGvow2dr6I1mrrLqwqEs6cjqT_Hu4cVEEpdzaMUiG-NpynLNA5FHfIZlM0zuZxoYqy9J7mTYOH5PgnuPOLf-XQLObNrUvt22xM9pJ1uIWUi26PdT85jVf09MDXwGIeSZTyKpjEwnYMi11mN4ccNnIuc3npwHoDG-qyPQeew1zpz_miLZ408wvH_BRRbX7ZMSNvW1juM0U2FNMBIcDf1PBIMV7djmDGscAcE7tymv-do6d0pxdw1jS7-1hz3uAArx5X3O0LvxOme3OolbEmyPESn26D9fW8xbWedTpRhpzLA
_uXpTwvnAybOcytCV_ANNZqTy_mIG4h3fWqbVWHTHyXe20QckI7vU8bJTvI60iqeB5jBqdZnwHM2cg72TkM9nYKy3Eb1idac9mScc_z5uRz402b84hyAgw7HG-ltmgOwfuy_lVIKuXbHZ-7QrDq8TZisDmwHp2qqNHlpunO19kRNSa0ybPYMV5BkTK7h8rJpBN1A9bLpR6fziTfx7GZpY0yuLVOXeWthNmU8SdSW0FWWvTb_XeaF-i8mZrIWBq5if-5PvVnKZsE8iOMAEz5LPT9h8WzuebHv82kwD7qtwYsDtF20f87DKLy6LOUPqXbyMhOy_Hm5lmVNV1XNreKYWepcafGzGrqX2rikWoqiWv1XKjeErrRKzY4VOHx8ePjXY2UOoatYyAPnpCpHuc3bh8sq6TTxl5WO3RFnMUm66c-PX6On74_Ry2_fvtwufyf-8gomhK7YRJWm2fYEd_sbmb19AvWm4XR6fKdAZ9YAK6yJmLU5mHgT6k2vvGsa2v1D32d1pwHe8WbVnY0ca1a7wbrgjjH3jzN3y9wx5t3gCeb0r2Q-_SuZe2OtXTdz0Np9QNtfyND_1Qzpn2LY4_dek_C3wXaPedV9_kpsn8X8BLY_7ZZRaHcTjwHnuLKjSPwMw1EkfoLfaO59iOFopb-vrmk1Jkpy2wMZUKl7VZf149e2JPSOrUzN5SDLssEdMQm9vTiS0JvAvQTGuTDVAdJgVn0rumHV5fXATLfJs_O7y11ngNCwwcLdgOuN2oHZMAOiublOU0yM7u7ENZTatjEk9PZvwNvL7O8NxfA22DHkCrUkdGagrsvVfbDWJU7gPrXek6jN4dX7vjQhteBo3c_6adEqal0jDLBMK3dWqyEVEifQv_m1spC3d_875YChQcjKCwnTOIELfuPza_-aXeDNNLz2_bnvebOLzc1sRv0EeTLjIUdKkyT28IrROOaM-xzDC3FDPep7ln4-9bzZJE1n_NrnXhjQ4Op65pErD7dMZBPbnU1Usb5wnrgJg6nnX2Qsxkw3fwJR3LgWLi7Xmlx5mdBGd9OMMJn7Y4nnx68vtk-5l_WN9eBmvEGaA4bNitA7vT8Jvc6jF2WR3Xys0SR05QzShK6cTf8LAAD__w10Fx4">