<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/98886>98886</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            CUDA grid sync hangs up with -O0
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          MichaelVarvarin
      </td>
    </tr>
</table>

<pre>
    With optimization disabled (`-O0`), execution of a CUDA kernel hangs at `cooperative_groups::this_grid().sync();` when `blockCount > 2 * smCount`, even though the maximum allowed `blockCount` is reported as `16 * smCount`. Every optimization level other than `-O0` works fine.

Tested with:
- clang version 17.0.6 (Fedora 17.0.6-2.fc39)
- CUDA versions 12.1 and 12.5, obtained from Nvidia website

Here is the code to reproduce the issue, along with the command used to compile it.

```
clang++ --std=c++17 ./test.cu -o test -O0 --cuda-gpu-arch=sm_86 -L/usr/local/cuda-12.1/lib64 -lcudart_static -ldl -lrt -pthread 
```

```
// System includes
#include <stdio.h>
#include <stdlib.h>

// CUDA runtime
#include <cuda_runtime.h>


//Cooperative groups
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Minimal cooperative kernel: each thread obtains the grid-wide group and
// waits at its barrier. It takes no parameters and does no other work.
// grid.sync() is only valid for cooperative launches, which is why main()
// starts this kernel with cudaLaunchCooperativeKernel.
__global__ void testKernel() {

  // Handle to the group spanning every thread of the whole grid.
  auto grid = cooperative_groups::this_grid();
  // Grid-wide barrier; this is the call the report says never returns.
  grid.sync();
  
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Launches testKernel cooperatively with `blockCount` blocks of `blockSize`
// threads on the default stream and waits for it to finish.
// Returns true only if both the launch and the wait succeeded.
static bool runCooperativeLaunch(int blockCount, int blockSize) {
  dim3 dimGrid(blockCount);
  dim3 dimBlock(blockSize);

  // testKernel takes no parameters. A single null entry keeps the argument
  // array well-formed C++ (a zero-length array is only a compiler extension).
  void *kernelArgs[] = {nullptr};

  if (!cudaOk(cudaLaunchCooperativeKernel((void *)testKernel, dimGrid,
                                          dimBlock, kernelArgs, 0, NULL),
              "cudaLaunchCooperativeKernel")) {
    return false;
  }
  // The launch is asynchronous; a hang inside the kernel shows up here.
  return cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
}

// Reproducer driver: queries the current device, prints the occupancy limit
// for testKernel, then performs two cooperative launches, one with
// 2 * smCount blocks and one with 2 * smCount + 1 blocks.
// Returns EXIT_SUCCESS if every CUDA call succeeded, EXIT_FAILURE otherwise.
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  const int blockSize = 1;

  int devID = 0;
  cudaDeviceProp props;

  // Get GPU information
  if (!cudaOk(cudaGetDevice(&devID), "cudaGetDevice") ||
      !cudaOk(cudaGetDeviceProperties(&props, devID),
              "cudaGetDeviceProperties")) {
    return EXIT_FAILURE;
  }
  printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
         props.major, props.minor);

  // grid.sync() needs a cooperative launch; bail out early if the device
  // cannot do one instead of failing later with a less obvious error.
  if (!props.cooperativeLaunch) {
    fprintf(stderr, "Device %d does not support cooperative launches.\n",
            devID);
    return EXIT_FAILURE;
  }

  int numBlocksPerSm = 0;
  if (!cudaOk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                  &numBlocksPerSm, testKernel, blockSize, 0),
              "cudaOccupancyMaxActiveBlocksPerMultiprocessor")) {
    return EXIT_FAILURE;
  }
  const int smCount = props.multiProcessorCount;

  printf("SmCount: %i, maximum blocks per SM: %i.\n\n", smCount, numBlocksPerSm);

  printf("Launching with 2 * SmCount: %i blocks.\n", 2 * smCount);
  if (!runCooperativeLaunch(2 * smCount, blockSize)) {
    return EXIT_FAILURE;
  }
  printf("Test passed.\n\n");

  printf("Launching with 2 * SmCount + 1: %i blocks. Clang with -O0 will hang up here.\n", 2 * smCount + 1);
  if (!runCooperativeLaunch(2 * smCount + 1, blockSize)) {
    return EXIT_FAILURE;
  }
  printf("Test passed.\n\n");

  printf("All tests passed.\n");
  return EXIT_SUCCESS;
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsV19vo7oS_zTOywhETP4-5CElTe_qtruVsr26b5GxJ8G7BiPbpO1--iMb0kCaPTr7flBFwcz85jd_POMwa-WxQlyR6R2ZbkascYU2qyfJC4bqf8ycmJHVKNfiffW9QMA35I2TugJ9gOxls4afaCpUcDCIv9CCroDMEq51jYY5ecL90eimtiRdk3TtCmn3RyMFoQtCl7F9r3j7SNI7MktAHrx6rjT_memmckDSe6BA6BpsGVbILCE0A4G2lg7BFQgle5NlUwJTSr-iGCJ41BxldQSDtTYOBTDrRcazG7DyALp2spS_WPBSWhDSslyhiGFdvYN2BRovoz9kFJ5QeWV841g7jx19S7zdV21-WjjICmOSbEiybu_f0Xoar9IVPixhLQKuWHWEExrrQcfzOIk9w8UWhTasW4hofODp0gesUwtJ6LQsjGk8BlYJ_zD1nHTumKxQwMHoEr6epJAMXjG30mGf03_QoPfWx5NrgeC0D5jRouFtlKW1DXpIpnR1DOw76bJklfBfGovCK3Jd1lIhSDfw28e4_QuvwWFC7wi9gyiyTpB0w9v38RxiQrcOrYt5A5EG_wjRtwSiiDeCRce6iZjhBUk3ttwvZhA9ErptrCF0qzRnitBtEPQR8Wsyn00gUn7NuL11zEkOkRIKImUcRLUrDDIBN6neXqRbQrewe7cOS5AVV41Ae_6WdgtA0sw6IXVckPR-ANfqhwSapnKyxFvKnvG--_4Zo4eUXTYddJvuFtynrfkJtGIl2ppxBH4Ekm7g1na-62vs90elc6b2ezhpKUK2_hsaQ7u7gcwH8gCscRp8J_itgRv94sMqBNWr9tF96ezMN32DsnJQMlkRuvCPzBy5L1heMOO7AKFrZo6nAdMzEoDXCA1lJ38hSTfjK-9bCYGnL5vgTdJj47O3wZPk-Gx0DbXRn4L3Ya6thwd08PD8ArI6aFOGFtNJeKwHdC1c8HsWjA79H0h5o2icRNvKt_ZD_7xWrI2s3CGI0VYZCJ0Kkq6BTDNCKaFT2z61mz_TZd24Vir2N-CsZrlU0r2TaVYFlQ9LWet77IuL0OzMtrvabyX7oc1FtJSVf13-Jl4-6FVT3vnM2Gc0u_Jm9L9x3tSs4u9P7G3NfYl9aDw1ysnaaI7WekuLTo_Q2RDYc-rXdNarh0--tFcyCK7n2k2aQLJz0Nt_Pttv59CFfNhJhK7bCbs2R9sO6QDgy3S-uUiflfpZ3HWjzWeQTqWnfR6Vgb6FGg3sns4CccjaJXPn0UgzuA7HdU6gv2-GLB5ZU_FCnkdGO8yvqHV84p7x4dAfVDiAkGXqbw9ta_hHsoE_oYte5paf49fWTMu511B7zWxxzguhy2FNfBC6XRGfrgupDHpJphmEs8jXl8fHa2cu7WT3XvHC6Co4sriW6yfAHzegZtaiGGZ40DWH_bKXVYA_zyv4mT6-zi5k4YgTxP0of5VKQeGXmhoKNPjb_Hd4f1AFf6_xby38QS0MEdZKhVZoBzBXEAZdYyq4__-X7_vdS5bd73aXhvGBfj5OjcQqFct0yUa4Gs_pOJnQ5XQyKlYsoclC5NNpmszFIU8WuaDJIs0Pc0yWPJ2O5IomdJLMx9PxmC7Hk3g6nog8nYs0Wc4WFJFMEiyZVLFSpzLW5jgKx9jVcrFYzEaK5ahs-O1DaYWv5zMu9T-FzMrrRHlztGSSKGmdvaA46RSuwtEtHGL8SSSUsvW1fK7wUWPUqnCuPc6E8X6UrmjymOvSH0rV6fwvqo3-gdwRug0kLKHbluRpRf8KAAD__2fN_YY">