[Openmp-commits] [openmp] aa27cfc - [OpenMP][CUDA] Cache the maximal number of threads per block (per kernel)
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Sun Aug 16 12:40:39 PDT 2020
Author: Johannes Doerfert
Date: 2020-08-16T14:38:33-05:00
New Revision: aa27cfc1e7d7456325e951a4ba3ced405027f7d0
URL: https://github.com/llvm/llvm-project/commit/aa27cfc1e7d7456325e951a4ba3ced405027f7d0
DIFF: https://github.com/llvm/llvm-project/commit/aa27cfc1e7d7456325e951a4ba3ced405027f7d0.diff
LOG: [OpenMP][CUDA] Cache the maximal number of threads per block (per kernel)
Instead of calling `cuFuncGetAttribute` with
`CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK` for every kernel invocation,
we can do it for the first one and cache the result as part of the
`KernelInfo` struct. The only functional change is that we now expect
`cuFuncGetAttribute` to succeed and otherwise propagate the error.
Ignoring any error seems like a slippery slope...
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D86038
Added:
Modified:
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 462bdfe5ca8b..6921c781e27f 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -75,6 +75,9 @@ struct KernelTy {
// 1 - Generic mode (with master warp)
int8_t ExecutionMode;
+ /// Maximal number of threads per block for this kernel.
+ int MaxThreadsPerBlock = 0;
+
KernelTy(CUfunction _Func, int8_t _ExecutionMode)
: Func(_Func), ExecutionMode(_ExecutionMode) {}
};
@@ -843,10 +846,9 @@ class DeviceRTLTy {
return OFFLOAD_SUCCESS;
}
- int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- const int ArgNum, const int TeamNum,
- const int ThreadLimit,
+ int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
+ ptrdiff_t *TgtOffsets, const int ArgNum,
+ const int TeamNum, const int ThreadLimit,
const unsigned int LoopTripCount,
__tgt_async_info *AsyncInfo) const {
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
@@ -862,10 +864,9 @@ class DeviceRTLTy {
Args[I] = &Ptrs[I];
}
- const KernelTy *KernelInfo =
- reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+ KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
- unsigned int CudaThreadsPerBlock;
+ int CudaThreadsPerBlock;
if (ThreadLimit > 0) {
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
CudaThreadsPerBlock = ThreadLimit;
@@ -886,13 +887,18 @@ class DeviceRTLTy {
CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
}
- int KernelLimit;
- Err = cuFuncGetAttribute(&KernelLimit,
- CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
- KernelInfo->Func);
- if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
- DP("Threads per block capped at kernel limit %d\n", KernelLimit);
- CudaThreadsPerBlock = KernelLimit;
+ if (!KernelInfo->MaxThreadsPerBlock) {
+ Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+ KernelInfo->Func);
+ if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
+ return OFFLOAD_FAIL;
+ }
+
+ if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
+ DP("Threads per block capped at kernel limit %d\n",
+ KernelInfo->MaxThreadsPerBlock);
+ CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
}
unsigned int CudaBlocksPerGrid;
More information about the Openmp-commits
mailing list