[flang-commits] [flang] [flang][cuda] Compute grid x when calling a kernel with <<<*, block>>> (PR #115538)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Fri Nov 8 13:09:14 PST 2024
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/115538
>From 3fbd025411966218c08470076853f332f248e84d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 8 Nov 2024 11:37:11 -0800
Subject: [PATCH 1/2] [flang][cuda] Compute grid x when star is used
---
flang/runtime/CUDA/kernel.cpp | 46 +++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index abb7ebb72e5923..8881d8a524aac0 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -25,6 +25,29 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
blockDim.x = blockX;
blockDim.y = blockY;
blockDim.z = blockZ;
+ bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
+ if (gridIsStar) {
+ int maxBlocks, nbBlocks, dev, multiProcCount;
+ cudaError_t err1, err2;
+ nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+ cudaGetDevice(&dev);
+ err1 = cudaDeviceGetAttribute(
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+ &maxBlocks, kernel, nbBlocks, smem);
+ if (err1 == cudaSuccess && err2 == cudaSuccess)
+ maxBlocks = multiProcCount * maxBlocks;
+ if (maxBlocks > 0) {
+ if (gridDim.y > 0)
+ maxBlocks = maxBlocks / gridDim.y;
+ if (gridDim.z > 0)
+ maxBlocks = maxBlocks / gridDim.z;
+ if (maxBlocks < 1)
+ maxBlocks = 1;
+ if (gridIsStar)
+ gridDim.x = maxBlocks;
+ }
+ }
cudaStream_t stream = 0; // TODO stream managment
CUDA_REPORT_IF_ERROR(
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +64,29 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
config.blockDim.x = blockX;
config.blockDim.y = blockY;
config.blockDim.z = blockZ;
+ bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
+ if (gridIsStar) {
+ int maxBlocks, nbBlocks, dev, multiProcCount;
+ cudaError_t err1, err2;
+ nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
+ cudaGetDevice(&dev);
+ err1 = cudaDeviceGetAttribute(
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+ &maxBlocks, kernel, nbBlocks, smem);
+ if (err1 == cudaSuccess && err2 == cudaSuccess)
+ maxBlocks = multiProcCount * maxBlocks;
+ if (maxBlocks > 0) {
+ if (config.gridDim.y > 0)
+ maxBlocks = maxBlocks / config.gridDim.y;
+ if (config.gridDim.z > 0)
+ maxBlocks = maxBlocks / config.gridDim.z;
+ if (maxBlocks < 1)
+ maxBlocks = 1;
+ if (gridIsStar)
+ config.gridDim.x = maxBlocks;
+ }
+ }
config.dynamicSmemBytes = smem;
config.stream = 0; // TODO stream managment
cudaLaunchAttribute launchAttr[1];
>From fc5ad5f016265cd051bf0b05d919feb42e9b49fa Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 8 Nov 2024 13:09:00 -0800
Subject: [PATCH 2/2] Compute up to one missing dim
---
flang/runtime/CUDA/kernel.cpp | 80 +++++++++++++++++++++++++++++------
1 file changed, 66 insertions(+), 14 deletions(-)
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index 8881d8a524aac0..88cdf3cf426229 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -25,8 +25,17 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
blockDim.x = blockX;
blockDim.y = blockY;
blockDim.z = blockZ;
- bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
- if (gridIsStar) {
+ unsigned nbNegGridDim{0};
+ if (gridX < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridY < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridZ < 0) {
+ ++nbNegGridDim;
+ }
+ if (nbNegGridDim == 1) {
int maxBlocks, nbBlocks, dev, multiProcCount;
cudaError_t err1, err2;
nbBlocks = blockDim.x * blockDim.y * blockDim.z;
@@ -35,18 +44,35 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxBlocks, kernel, nbBlocks, smem);
- if (err1 == cudaSuccess && err2 == cudaSuccess)
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
maxBlocks = multiProcCount * maxBlocks;
+ }
if (maxBlocks > 0) {
- if (gridDim.y > 0)
+ if (gridDim.x > 0) {
+ maxBlocks = maxBlocks / gridDim.x;
+ }
+ if (gridDim.y > 0) {
maxBlocks = maxBlocks / gridDim.y;
- if (gridDim.z > 0)
+ }
+ if (gridDim.z > 0) {
maxBlocks = maxBlocks / gridDim.z;
- if (maxBlocks < 1)
+ }
+ if (maxBlocks < 1) {
maxBlocks = 1;
- if (gridIsStar)
+ }
+ if (gridX < 0) {
gridDim.x = maxBlocks;
+ }
+ if (gridY < 0) {
+ gridDim.y = maxBlocks;
+ }
+ if (gridZ < 0) {
+ gridDim.z = maxBlocks;
+ }
}
+ } else if (nbNegGridDim > 1) {
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Too many invalid grid dimensions");
}
cudaStream_t stream = 0; // TODO stream managment
CUDA_REPORT_IF_ERROR(
@@ -64,8 +90,17 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
config.blockDim.x = blockX;
config.blockDim.y = blockY;
config.blockDim.z = blockZ;
- bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
- if (gridIsStar) {
+ unsigned nbNegGridDim{0};
+ if (gridX < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridY < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridZ < 0) {
+ ++nbNegGridDim;
+ }
+ if (nbNegGridDim == 1) {
int maxBlocks, nbBlocks, dev, multiProcCount;
cudaError_t err1, err2;
nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
@@ -74,18 +109,35 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxBlocks, kernel, nbBlocks, smem);
- if (err1 == cudaSuccess && err2 == cudaSuccess)
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
maxBlocks = multiProcCount * maxBlocks;
+ }
if (maxBlocks > 0) {
- if (config.gridDim.y > 0)
+ if (config.gridDim.x > 0) {
+ maxBlocks = maxBlocks / config.gridDim.x;
+ }
+ if (config.gridDim.y > 0) {
maxBlocks = maxBlocks / config.gridDim.y;
- if (config.gridDim.z > 0)
+ }
+ if (config.gridDim.z > 0) {
maxBlocks = maxBlocks / config.gridDim.z;
- if (maxBlocks < 1)
+ }
+ if (maxBlocks < 1) {
maxBlocks = 1;
- if (gridIsStar)
+ }
+ if (gridX < 0) {
config.gridDim.x = maxBlocks;
+ }
+ if (gridY < 0) {
+ config.gridDim.y = maxBlocks;
+ }
+ if (gridZ < 0) {
+ config.gridDim.z = maxBlocks;
+ }
}
+ } else if (nbNegGridDim > 1) {
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Too many invalid grid dimensions");
}
config.dynamicSmemBytes = smem;
config.stream = 0; // TODO stream managment
More information about the flang-commits
mailing list