<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/55102>55102</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            When should scope_end callbacks be issued?
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          SpinTensor
      </td>
    </tr>
</table>

<pre>
    Hi,

please check out the following test code:
```
#include <stdio.h>
#include <time.h>
#include <stdbool.h>
#include <unistd.h>
#include <omp.h>
#include <omp-tools.h>
// Reference timestamp; filled in by the first call to get_runtime_usec().
struct timespec local_ref_time = {0, 0};
// Set to true once local_ref_time has been captured.
bool ref_time_set = false;

// Returns the number of microseconds that have passed on CLOCK_MONOTONIC
// since the first call to this function. The first call captures the
// reference timestamp and then measures against it like any later call.
long long get_runtime_usec() {
   if (ref_time_set == false) {
      ref_time_set = true;
      clock_gettime(CLOCK_MONOTONIC, &local_ref_time);
   }

   struct timespec now;
   clock_gettime(CLOCK_MONOTONIC, &now);

   long long secs  = now.tv_sec  - local_ref_time.tv_sec;
   long long nsecs = now.tv_nsec - local_ref_time.tv_nsec;
   // Borrow one second when the nanosecond difference is negative.
   if (nsecs < 0) {
      secs  -= 1;
      nsecs += 1000000000;
   }

   // Seconds scaled up to microseconds plus nanoseconds scaled down.
   return secs * 1000000 + nsecs / 1000;
}

static void my_ompt_callback_parallel_begin(ompt_data_t *encountering_task_data,
                                            const ompt_frame_t *encountering_task_frame,
                                            ompt_data_t *parallel_data,
                                            unsigned int requested_parallelism,
                                            int flags, const void *codeptr_ra) {
   fprintf(stderr, "par begin:                   %7lld usec\n", get_runtime_usec());
}
// OMPT parallel-end callback: writes the current runtime offset to stderr.
// None of the callback arguments are inspected.
static void my_ompt_callback_parallel_end(ompt_data_t *parallel_data,
                                          ompt_data_t *encountering_task_data,
                                          int flags, const void *codeptr_ra) {
   const long long elapsed_usec = get_runtime_usec();
   fprintf(stderr, "par end:                     %7lld usec\n", elapsed_usec);
}

// OMPT implicit-task callback: for each known scope endpoint, writes one line
// to stderr with the calling thread's number and the current runtime offset.
// Endpoint values other than begin / end / beginend are silently ignored.
// Only `endpoint` is read; the remaining arguments are unused.
//
// NOTE(review): the OpenMP specification appears to restrict calling runtime
// library routines such as omp_get_thread_num() from tool callbacks; confirm
// whether the `index` argument should be reported instead.
static void my_ompt_callback_implicit_task(ompt_scope_endpoint_t endpoint,
                                           ompt_data_t *parallel_data,
                                           ompt_data_t *task_data,
                                           unsigned int actual_parallelism,
                                           unsigned int index,
                                           int flags) {
   if (endpoint == ompt_scope_begin) {
      fprintf(stderr, "implicit_task begin:    t=%d: %7lld usec\n",
              omp_get_thread_num(), get_runtime_usec());
   } else if (endpoint == ompt_scope_end) {
      fprintf(stderr, "implicit_task end:      t=%d: %7lld usec\n",
              omp_get_thread_num(), get_runtime_usec());
   } else if (endpoint == ompt_scope_beginend) {
      fprintf(stderr, "implicit_task beginend: t=%d: %7lld usec\n",
              omp_get_thread_num(), get_runtime_usec());
   }
}

// Tool initializer: looks up the runtime's "ompt_set_callback" entry point and
// registers the parallel-begin, parallel-end and implicit-task callbacks.
//
// lookup             - runtime-provided function used to resolve entry points
// initial_device_num - unused
// tool_data          - unused
//
// Returns 1 on success (activates the tool), or 0 if the "ompt_set_callback"
// entry point could not be resolved, so that the failure is reported to the
// runtime instead of calling through a null function pointer.
int my_ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
                       ompt_data_t *tool_data) {
   (void) initial_device_num;
   (void) tool_data;

   // Get the set_callback function pointer
   ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
   if (!ompt_set_callback) {
      return 0; // failure: nothing can be registered without this entry point
   }

   // register the available callback functions; the typed intermediate
   // pointers let the compiler check each handler's signature before the
   // cast to the generic ompt_callback_t
   ompt_callback_parallel_begin_t f_ompt_callback_parallel_begin = &my_ompt_callback_parallel_begin;
   ompt_set_callback(ompt_callback_parallel_begin, (ompt_callback_t)f_ompt_callback_parallel_begin);
   ompt_callback_parallel_end_t f_ompt_callback_parallel_end = &my_ompt_callback_parallel_end;
   ompt_set_callback(ompt_callback_parallel_end, (ompt_callback_t)f_ompt_callback_parallel_end);
   ompt_callback_implicit_task_t f_ompt_callback_implicit_task = &my_ompt_callback_implicit_task;
   ompt_set_callback(ompt_callback_implicit_task, (ompt_callback_t)f_ompt_callback_implicit_task);

   return 1; // success: activates tool
}
// Tool finalizer: this tool holds no resources, so there is nothing to do.
void my_ompt_finalize(ompt_data_t *tool_data)
{
   // argument intentionally unused
   (void)tool_data;
}
// start tool
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
   static ompt_start_tool_result_t ompt_start_tool_result;
   ompt_start_tool_result.initialize = &my_ompt_initialize;
   ompt_start_tool_result.finalize = &my_ompt_finalize;
   return &ompt_start_tool_result; // success: registers tool
}

// Runs two parallel regions of two threads each, separated by one second of
// serial time. Inside a region, thread t sleeps for t+1 seconds, so the
// threads leave the region at clearly different times.
int main(int argc, char **argv) {
   (void)argc;
   (void)argv;

   // first parallel region
   #pragma omp parallel num_threads(2)
   {
      sleep(omp_get_thread_num() + 1);
   }

   // blank line in the log, then a serial pause between the regions
   fputs("\n", stderr);
   sleep(1);

   // second parallel region
   #pragma omp parallel num_threads(2)
   {
      sleep(omp_get_thread_num() + 1);
   }

   // serial pause before the program exits
   sleep(1);

   return 0;
}
```
I register the implicit task callback, which is triggered every time an implicit task is created (at the beginning of a scope) or ended (at the end of a scope).
I compile this code with a fairly recent git build of clang:
```
clang --version
clang version 15.0.0 (/home/fuhl/software/llvm_src/clang 34312f1f0c4f56ae78577783ec62bee3fb5dab90)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/bm/llvm-15.x.x/bin
```

`clang -fopenmp -o test.x test.c`

When running the code I get the following output:
```
implicit_task begin:    t=0:       0 usec
par begin:                     10852 usec
implicit_task begin:    t=0:   11077 usec
implicit_task begin:    t=1:   11220 usec
implicit_task end:      t=0: 2011483 usec
par end:                     2011540 usec

par begin:                   3011718 usec
implicit_task begin:    t=0: 3011785 usec
implicit_task end:      t=1: 3011823 usec
implicit_task begin:    t=1: 3011855 usec
implicit_task end:      t=0: 5012031 usec
par end:                     5012059 usec
implicit_task end:      t=0: 6012236 usec
implicit_task end:      t=1: 6012459 usec
```
The first line is the initial task. After the first parallel region starts, only the master thread reaches the scope end and triggers the `implicit_task` callback. The implicit task of the other thread is only ended after the next parallel region has already begun.
I know that the threads are kept alive because thread creation is expensive, but as I understand it, a thread should only start waiting after it has triggered all of its scope_end callbacks.
The same behavior can also be observed with other callbacks, like sync_region and sync_region_wait, which are only triggered after the next parallel region has started or when the program terminates.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzVWVtz27oR_jXyC0YaXkRdHvQQy8dtpu3JQzPTRw4IgiJqiGABUJb767sAeBelyEo6c47HURRg77v4doEkIv3Y_ZXNgv3Me5l5X9xnySlWFJGckjdRaaRzijLBuXhnxQFpqjQRKZ2FNfls5dW_7p9ByArCq5SiWbhXOmVikc_C36Z2NTvSq5vAmgjBr-5XBQOSq9viWN7am2uQrXoUSsuKgLNgkiopQVwQzGNJs9gsAdcLmq2fPc4hWuZz_TILnx2rMRM1lLGi2lJnmCva0rhPLiCC9uNAdSyrwnJUipJZsJkFW6PCUSKEWIbsqt8XPSaCnwvN4Emn2NEQcOctBqWGEKTu__5t_7f4H99-__b92-9f9-AT6FoNfQZVfSnG4-b7dLAsU4_jM0onFHbBSinXGNwjyPrXcSz0yS3PR_mqN26JKwzjpTi7PCWuGMlz6RnI2iNvIj99kuDZaPS95oeP8tR5OreE_FoGJNWVLBpJUJRfOlbQ0lM6C179gaZWTFP4WDOCToKl6PgRw9nQMfjOEwzJK7GEr5THCT2wAvy12ykG2VBrwRdaEAFVTCVAQ6yxerN7LaCgT_wQUSiNrPxMYijnKwrs5kMaRra3vj1sclUodihoilihISP_qQAbadoGjanjQ2KNtIzjgzInxIXFZgdsNshbahlLPC60rIQQ6QxSBKhIpXSnKwBbkMtdOGXDLIjWnKfIQlC0hwwHhnEanfrnsy2i-8qHFull8fxsAv5ftfir42-cn4z-r4j_HVlgx5IzwrQNSpMGRURJTV5KAXZDBJuvj5Tsrz5ZI3k_BSyDQ4qJrgDXf_KEDkSyIqXnR6T0ymxYSuqdaZKb7tLl5KKrEDOj9TLZHPKRGVfKclASA4DQ0HegKm3FTlfnFUfBFtPrY51LitO4qI5N1d5Tz62YBLjfxtPLyFd3oB7xtH8U_wye2sw87m6P_Q_pbkozXHF96d0FcYd3Q-CD6QZ-4SwxzTBn_4V7Sw17yhGYU9YgYkfVoGBWFUQzUcRciLeqBLhxX4xv7nBbjjilJ0ao8_32SR9DF9wNaugaHuHa8L9Qd8GC2b0FbNRYhezhp7JlcpXRI4315Zq7qzQwPyAGIxr_IE_BBYWtg-HU78yUUEgw20hrKz5hxnHCu1i3FquhqVfmSLA6uzlo1i6sfjSP9gyd8GVze5bdowsSE6Dblo3Cc33cueUjbP_YQ3tsH_LPDluf9c4y3fBtACwTzg2B55p3w1HkU-6Nppg7_RtxbUcXctTdpGCnqXZVEUKVMkgJAwM7YU0VMid5BEGDcStjxQBa7sKAjRFhljuaywmvNkpjqXtWuIiZxdgyS6oASp3G0R7oGYwsBtFPVCo4sA8MLm4iJjlMtqCqQf9W3miWcYPpVWOnNy4rY0yx6CH-qNh6KP9jMU3WxkLabPZE1JUCVFetnqigBjonK6jXo7C9X9shVR6IvXrUIYZfWDpdVk9YSnw4YuMbag4ygh5Vt2oYLDcWz7smOui-RpejBB4bgKutfti2FafUtJCWeRY8-zeei67MKu3kMWBshF8I_NO4O-nBuIq8iYM-fEn9Ouy6DZIhC7AdTO7Re87gusCgwCQ7HCC-cMwpnMcP-3KJixErEBJwRFNzo91gN3_Y5laYB16RIYzs9FnPeMLeYAfUpoEN6BaNyQRiymAy0LlRA7dlBJeZHCgzzCT_QAV9RwewpIGLKccJx2DHfD6gcYv1EvKjhbfw3Pvoay7Mg9BrVuWAdK9KZPodS7PC-ekYK2mewBx7uAz9IPMzjyyzaIXpehOt1-tNSMkqSCgNsyRKcbL12ir6DgePmvkUnTereLWcV8VbId6LOWdFdZ4fiqqms6WBjuAwN9SlUOxcxwTQ0pRq-sKkm71fRQmt6jU51ibOwZvz4myW2HRE2sU6NBlEvYBTMBf2
PX5xdn-REcO_clogAGibV5M3m5CvZmofveqLSpeVvvamf_u26HWPG567Ubj_Rrj98oSQ722ioMdwlxbf99bru5n8hikIvKtM4zuh1RR4vr_chCN_brzkGIZo2ddyVxhCYFv7m8-FwTJtors98humTRB-LnaWKbpfkzUv8vzAC_37Y2cZou3ntKyAKQhXnwuCYVoONI1K_bs5FkzCgAMnnFpQNdjrRgqLnwv0JWsw2VG2vcjgNYCTHQwUEgXgnaE6YgPic9c-gAiTnDq5Fj4tmmL4U8O32wKLhrPrymtBf4GMmQLI5CW2W7UWsA1449bWgp4vTcXc2PRhko-LFsQNxAFPDfZ1d4WxhKI3WsJ8wtnJvMADF8EQyZrCdRUjFaygZ0Ao1dBVwKQAeCowCyJmnGXmobEnH6lcVDxF79jtOMONXy4q0H_A8i5q9hIFXQ6aenvjX3Q5VPhomlqOTwz6F8HGVSVgBYlEUXkCcbYxGXV1JLuHA1DP2RsI-ShI3EQK1PX-HTd2uuZrYuPy3Vh7R_BtnThK4chKKQ4SH23ZUXmEGRT2F0_pLky34RY_aaY53VlYr8PVxaI13zjJlKpAcvj6VEm-y7UuzSzqhlNov3mVLKBT1_2naUOg_d-UmNZk2SEOr1Hke8FTvgtXeBlF2ygJlumSeNl2G2TEpxsvTcgy2G6eOE4oV7tZ9AwTnWnyVoQd8l6e2C7wAIGXwQrgO4w2i3S9CgDH1yFOViFInC09CgMwXxg7FkIenuTOmpRUBwWbHMYg1W1iZS8z1KoD-bjSuZC7f5as-A5VJ-ST1b6z1v8PJCBhSQ">