<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href=https://github.com/llvm/llvm-project/issues/98886>98886</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            CUDA grid sync hangs up with -O0

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          MichaelVarvarin

      </td>

    </tr>

</table>

<pre>

    When optimization is disabled (`-O0`), execution of a CUDA kernel freezes on `cooperative_groups::this_grid().sync();` if `blockCount > 2 * smCount`, even though the maximum allowed `blockCount` is reported as `16 * smCount`. Any other optimization level works fine.

Tested with:

- clang version 17.0.6 (Fedora 17.0.6-2.fc39)

- CUDA versions 12.1 and 12.5, obtained from Nvidia website

Here is the code to reproduce the issue, along with the command used to compile it.

```

clang++ --std=c++17 ./test.cu -o test -O0 --cuda-gpu-arch=sm_86 -L/usr/local/cuda-12.1/lib64 -lcudart_static -ldl -lrt -pthread 

```

```

// System includes

#include <stdio.h>

#include <stdlib.h>

// CUDA runtime

#include <cuda_runtime.h>

//Cooperative groups

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Minimal reproducer kernel: each thread obtains the grid-wide cooperative
// group and waits at its barrier; it does no other work.
// It takes no parameters and is launched by main() through
// cudaLaunchCooperativeKernel.
__global__ void testKernel() {

  // Handle to the group returned by cooperative_groups::this_grid().
  auto grid = cooperative_groups::this_grid();

  // Grid-wide barrier; this is the call the report says never returns at -O0.
  grid.sync();

}

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess. Without this, a failed launch or
// synchronize would go unnoticed and the program would still print
// "Test passed.".
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t status_ = (call);                                            \
    if (status_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n", __FILE__,      \
              __LINE__, #call, cudaGetErrorString(status_));                 \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

// Launches testKernel cooperatively with `blockCount` blocks of `blockSize`
// threads on the default stream (no dynamic shared memory) and blocks until
// the device has finished. Exits the program if either call reports an error.
static void launchAndWait(int blockCount, int blockSize) {
  dim3 dimGrid(blockCount);
  dim3 dimBlock(blockSize);

  // testKernel takes no parameters, so there is no argument array to pass.
  // (The previous `void *kernelArgs[] = {};` was a zero-length array, which
  // is a compiler extension rather than standard C++.)
  void **kernelArgs = nullptr;

  CUDA_CHECK(cudaLaunchCooperativeKernel((void *)testKernel, dimGrid, dimBlock,
                                         kernelArgs, 0, NULL));
  CUDA_CHECK(cudaDeviceSynchronize());
}

// Reproducer entry point.
//
// Prints the current device's name and compute capability, queries the
// occupancy figure for testKernel at the chosen block size, then performs two
// cooperative launches: one with 2 * smCount blocks and one with
// 2 * smCount + 1 blocks. Every CUDA runtime call is checked, so a rejected
// launch is reported on stderr instead of being counted as a pass.
//
// Returns EXIT_SUCCESS when both launches complete; exits with EXIT_FAILURE
// on the first CUDA error. argc/argv are unused.
int main(int argc, char **argv) {
  int blockSize = 1;  // one thread per block
  int devID = 0;
  cudaDeviceProp props;

  // Get GPU information
  CUDA_CHECK(cudaGetDevice(&devID));
  CUDA_CHECK(cudaGetDeviceProperties(&props, devID));
  printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
         props.major, props.minor);

  // Number of testKernel blocks the runtime reports can be resident per SM
  // at this block size with no dynamic shared memory.
  int numBlocksPerSm = 0;
  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocksPerSm, testKernel, blockSize, 0));

  int smCount = props.multiProcessorCount;
  printf("SmCount: %i, maximum blocks per SM: %i.\n\n", smCount, numBlocksPerSm);

  printf("Launching with 2 * SmCount: %i blocks.\n", 2 * smCount);
  launchAndWait(2 * smCount, blockSize);
  printf("Test passed.\n\n");

  printf("Launching with 2 * SmCount + 1: %i blocks. Clang with -O0 will hang up here.\n", 2 * smCount + 1);
  launchAndWait(2 * smCount + 1, blockSize);
  printf("Test passed.\n\n");

  printf("All tests passed.\n");
  return EXIT_SUCCESS;
}

```

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsV19vo7oS_zTOywhETP4-5CElTe_qtruVsr26b5GxJ8G7BiPbpO1--iMb0kCaPTr7flBFwcz85jd_POMwa-WxQlyR6R2ZbkascYU2qyfJC4bqf8ycmJHVKNfiffW9QMA35I2TugJ9gOxls4afaCpUcDCIv9CCroDMEq51jYY5ecL90eimtiRdk3TtCmn3RyMFoQtCl7F9r3j7SNI7MktAHrx6rjT_memmckDSe6BA6BpsGVbILCE0A4G2lg7BFQgle5NlUwJTSr-iGCJ41BxldQSDtTYOBTDrRcazG7DyALp2spS_WPBSWhDSslyhiGFdvYN2BRovoz9kFJ5QeWV841g7jx19S7zdV21-WjjICmOSbEiybu_f0Xoar9IVPixhLQKuWHWEExrrQcfzOIk9w8UWhTasW4hofODp0gesUwtJ6LQsjGk8BlYJ_zD1nHTumKxQwMHoEr6epJAMXjG30mGf03_QoPfWx5NrgeC0D5jRouFtlKW1DXpIpnR1DOw76bJklfBfGovCK3Jd1lIhSDfw28e4_QuvwWFC7wi9gyiyTpB0w9v38RxiQrcOrYt5A5EG_wjRtwSiiDeCRce6iZjhBUk3ttwvZhA9ErptrCF0qzRnitBtEPQR8Wsyn00gUn7NuL11zEkOkRIKImUcRLUrDDIBN6neXqRbQrewe7cOS5AVV41Ae_6WdgtA0sw6IXVckPR-ANfqhwSapnKyxFvKnvG--_4Zo4eUXTYddJvuFtynrfkJtGIl2ppxBH4Ekm7g1na-62vs90elc6b2ezhpKUK2_hsaQ7u7gcwH8gCscRp8J_itgRv94sMqBNWr9tF96ezMN32DsnJQMlkRuvCPzBy5L1heMOO7AKFrZo6nAdMzEoDXCA1lJ38hSTfjK-9bCYGnL5vgTdJj47O3wZPk-Gx0DbXRn4L3Ya6thwd08PD8ArI6aFOGFtNJeKwHdC1c8HsWjA79H0h5o2icRNvKt_ZD_7xWrI2s3CGI0VYZCJ0Kkq6BTDNCKaFT2z61mz_TZd24Vir2N-CsZrlU0r2TaVYFlQ9LWet77IuL0OzMtrvabyX7oc1FtJSVf13-Jl4-6FVT3vnM2Gc0u_Jm9L9x3tSs4u9P7G3NfYl9aDw1ysnaaI7WekuLTo_Q2RDYc-rXdNarh0--tFcyCK7n2k2aQLJz0Nt_Pttv59CFfNhJhK7bCbs2R9sO6QDgy3S-uUiflfpZ3HWjzWeQTqWnfR6Vgb6FGg3sns4CccjaJXPn0UgzuA7HdU6gv2-GLB5ZU_FCnkdGO8yvqHV84p7x4dAfVDiAkGXqbw9ta_hHsoE_oYte5paf49fWTMu511B7zWxxzguhy2FNfBC6XRGfrgupDHpJphmEs8jXl8fHa2cu7WT3XvHC6Co4sriW6yfAHzegZtaiGGZ40DWH_bKXVYA_zyv4mT6-zi5k4YgTxP0of5VKQeGXmhoKNPjb_Hd4f1AFf6_xby38QS0MEdZKhVZoBzBXEAZdYyq4__-X7_vdS5bd73aXhvGBfj5OjcQqFct0yUa4Gs_pOJnQ5XQyKlYsoclC5NNpmszFIU8WuaDJIs0Pc0yWPJ2O5IomdJLMx9PxmC7Hk3g6nog8nYs0Wc4WFJFMEiyZVLFSpzLW5jgKx9jVcrFYzEaK5ahs-O1DaYWv5zMu9T-FzMrrRHlztGSSKGmdvaA46RSuwtEtHGL8SSSUsvW1fK7wUWPUqnCuPc6E8X6UrmjymOvSH0rV6fwvqo3-gdwRug0kLKHbluRpRf8KAAD__2fN_YY">