<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/98886>98886</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
CUDA grid sync hangs up with -O0
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
MichaelVarvarin
</td>
</tr>
</table>
<pre>
When optimization is disabled (`-O0`), execution of a CUDA kernel freezes on `cooperative_groups::this_grid().sync();` if `blockCount > 2 * smCount`, despite the maximum allowed `blockCount` being reported as `16 * smCount`. Every optimization level other than `-O0` works fine.
Tested with:
- clang version 17.0.6 (Fedora 17.0.6-2.fc39)
- CUDA versions 12.1 and 12.5, obtained from Nvidia website
Here is the code to reproduce the issue, along with the command used to compile it.
```
clang++ --std=c++17 ./test.cu -o test -O0 --cuda-gpu-arch=sm_86 -L/usr/local/cuda-12.1/lib64 -lcudart_static -ldl -lrt -pthread
```
```
// System includes
#include <stdio.h>
// CUDA runtime
#include <cuda_runtime.h>
//Cooperative groups
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Minimal reproducer kernel: takes no parameters and does no work other than
// waiting at the grid-wide barrier. It is launched from main() through
// cudaLaunchCooperativeKernel.
__global__ void testKernel() {
// Handle to the cooperative group returned by this_grid().
auto grid = cooperative_groups::this_grid();
// Grid-wide barrier; this is the call the report describes as hanging at -O0.
grid.sync();
}
// Host driver for the grid-sync reproducer.
//
// Prints the current device's name and compute capability, queries how many
// blocks of testKernel (at one thread per block) the runtime reports as
// resident per multiprocessor, and then performs two cooperative launches:
// one with 2 * smCount blocks and one with 2 * smCount + 1 blocks.
//
// Every CUDA runtime call is checked. On the first failure the error string is
// written to stderr and the program returns EXIT_FAILURE, so "Test passed." is
// printed only when both the launch and the following synchronize succeeded.
//
// Returns EXIT_SUCCESS when both launches complete, EXIT_FAILURE otherwise.
int main(int argc, char **argv) {
    // Command-line arguments are not used by this reproducer.
    (void)argc;
    (void)argv;

    // Reports a failed CUDA runtime call on stderr; yields true on success.
    auto succeeded = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    };

    const int blockSize = 1;
    int devID = 0;
    cudaDeviceProp props;

    // Get GPU information
    if (!succeeded(cudaGetDevice(&devID), "cudaGetDevice") ||
        !succeeded(cudaGetDeviceProperties(&props, devID),
                   "cudaGetDeviceProperties")) {
        return EXIT_FAILURE;
    }
    printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
           props.major, props.minor);

    int numBlocksPerSm = 0;
    if (!succeeded(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                       &numBlocksPerSm, testKernel, blockSize, 0),
                   "cudaOccupancyMaxActiveBlocksPerMultiprocessor")) {
        return EXIT_FAILURE;
    }

    const int smCount = props.multiProcessorCount;

    // testKernel takes no parameters, so no entry is ever read; a one-element
    // placeholder is used because ISO C++ does not allow a zero-length array.
    void *kernelArgs[] = {nullptr};

    printf("SmCount: %i, maximum blocks per SM: %i.\n\n", smCount, numBlocksPerSm);

    // Cooperatively launches testKernel with `blockCount` blocks of
    // `blockSize` threads and waits for completion; yields true when both the
    // launch and the synchronize report success.
    auto launchAndWait = [&](int blockCount) {
        dim3 dimGrid(blockCount);
        dim3 dimBlock(blockSize);
        return succeeded(cudaLaunchCooperativeKernel((void *)testKernel, dimGrid,
                                                     dimBlock, kernelArgs, 0, NULL),
                         "cudaLaunchCooperativeKernel") &&
               succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    };

    printf("Launching with 2 * SmCount: %i blocks.\n", 2 * smCount);
    if (!launchAndWait(2 * smCount)) {
        return EXIT_FAILURE;
    }
    printf("Test passed.\n\n");

    printf("Launching with 2 * SmCount + 1: %i blocks. Clang with -O0 will hang up here.\n", 2 * smCount + 1);
    if (!launchAndWait(2 * smCount + 1)) {
        return EXIT_FAILURE;
    }
    printf("Test passed.\n\n");

    printf("All tests passed.\n");
    return EXIT_SUCCESS;
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsV19vo7oS_zTOywhETP4-5CElTe_qtruVsr26b5GxJ8G7BiPbpO1--iMb0kCaPTr7flBFwcz85jd_POMwa-WxQlyR6R2ZbkascYU2qyfJC4bqf8ycmJHVKNfiffW9QMA35I2TugJ9gOxls4afaCpUcDCIv9CCroDMEq51jYY5ecL90eimtiRdk3TtCmn3RyMFoQtCl7F9r3j7SNI7MktAHrx6rjT_memmckDSe6BA6BpsGVbILCE0A4G2lg7BFQgle5NlUwJTSr-iGCJ41BxldQSDtTYOBTDrRcazG7DyALp2spS_WPBSWhDSslyhiGFdvYN2BRovoz9kFJ5QeWV841g7jx19S7zdV21-WjjICmOSbEiybu_f0Xoar9IVPixhLQKuWHWEExrrQcfzOIk9w8UWhTasW4hofODp0gesUwtJ6LQsjGk8BlYJ_zD1nHTumKxQwMHoEr6epJAMXjG30mGf03_QoPfWx5NrgeC0D5jRouFtlKW1DXpIpnR1DOw76bJklfBfGovCK3Jd1lIhSDfw28e4_QuvwWFC7wi9gyiyTpB0w9v38RxiQrcOrYt5A5EG_wjRtwSiiDeCRce6iZjhBUk3ttwvZhA9ErptrCF0qzRnitBtEPQR8Wsyn00gUn7NuL11zEkOkRIKImUcRLUrDDIBN6neXqRbQrewe7cOS5AVV41Ae_6WdgtA0sw6IXVckPR-ANfqhwSapnKyxFvKnvG--_4Zo4eUXTYddJvuFtynrfkJtGIl2ppxBH4Ekm7g1na-62vs90elc6b2ezhpKUK2_hsaQ7u7gcwH8gCscRp8J_itgRv94sMqBNWr9tF96ezMN32DsnJQMlkRuvCPzBy5L1heMOO7AKFrZo6nAdMzEoDXCA1lJ38hSTfjK-9bCYGnL5vgTdJj47O3wZPk-Gx0DbXRn4L3Ya6thwd08PD8ArI6aFOGFtNJeKwHdC1c8HsWjA79H0h5o2icRNvKt_ZD_7xWrI2s3CGI0VYZCJ0Kkq6BTDNCKaFT2z61mz_TZd24Vir2N-CsZrlU0r2TaVYFlQ9LWet77IuL0OzMtrvabyX7oc1FtJSVf13-Jl4-6FVT3vnM2Gc0u_Jm9L9x3tSs4u9P7G3NfYl9aDw1ysnaaI7WekuLTo_Q2RDYc-rXdNarh0--tFcyCK7n2k2aQLJz0Nt_Pttv59CFfNhJhK7bCbs2R9sO6QDgy3S-uUiflfpZ3HWjzWeQTqWnfR6Vgb6FGg3sns4CccjaJXPn0UgzuA7HdU6gv2-GLB5ZU_FCnkdGO8yvqHV84p7x4dAfVDiAkGXqbw9ta_hHsoE_oYte5paf49fWTMu511B7zWxxzguhy2FNfBC6XRGfrgupDHpJphmEs8jXl8fHa2cu7WT3XvHC6Co4sriW6yfAHzegZtaiGGZ40DWH_bKXVYA_zyv4mT6-zi5k4YgTxP0of5VKQeGXmhoKNPjb_Hd4f1AFf6_xby38QS0MEdZKhVZoBzBXEAZdYyq4__-X7_vdS5bd73aXhvGBfj5OjcQqFct0yUa4Gs_pOJnQ5XQyKlYsoclC5NNpmszFIU8WuaDJIs0Pc0yWPJ2O5IomdJLMx9PxmC7Hk3g6nog8nYs0Wc4WFJFMEiyZVLFSpzLW5jgKx9jVcrFYzEaK5ahs-O1DaYWv5zMu9T-FzMrrRHlztGSSKGmdvaA46RSuwtEtHGL8SSSUsvW1fK7wUWPUqnCuPc6E8X6UrmjymOvSH0rV6fwvqo3-gdwRug0kLKHbluRpRf8KAAD__2fN_YY">