[Openmp-commits] [PATCH] D51624: [libomptarget][CUDA] Use cuDeviceGetAttribute, NFCI.

Tue Sep 4 08:14:51 PDT 2018

This revision was automatically updated to reflect the committed changes.
Hahnfeld marked an inline comment as done.
Closed by commit rL341372: [libomptarget][CUDA] Use cuDeviceGetAttribute, NFCI. (authored by Hahnfeld, committed by ).
Herald added a subscriber: llvm-commits.

Changed prior to commit:
  https://reviews.llvm.org/D51624?vs=163793&id=163832#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D51624

Files:
  openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp


Index: openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================

--- openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
@@ -285,43 +285,48 @@
     return OFFLOAD_FAIL;
   }
 
-  // scan properties to determine number of threads/block and blocks/grid.
-  CUdevprop Properties;
-  err = cuDeviceGetProperties(&Properties, cuDevice);
+  // Query attributes to determine number of threads/block and blocks/grid.
+  int maxGridDimX;
+  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                             cuDevice);
   if (err != CUDA_SUCCESS) {
-    DP("Error getting device Properties, use defaults\n");
+    DP("Error getting max grid dimension, use default\n");
     DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
-    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
-    DeviceInfo.WarpSize[device_id] = 32;
+  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
   } else {
-    // Get blocks per grid
-    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
-      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
-      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
-    } else {
-      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
-      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
-          "at the hard limit\n", Properties.maxGridSize[0],
-          RTLDeviceInfoTy::HardTeamLimit);
-    }
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+       "at the hard limit\n",
+       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+  }
 
-    // Get threads per block, exploit threads only along x axis
-    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
-      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
-      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
-      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
-        DP("(fewer than max per block along all xyz dims %d)\n",
-            Properties.maxThreadsPerBlock);
-      }
-    } else {
-      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
-      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
-          "capping at the hard limit\n", Properties.maxThreadsDim[0],
-          RTLDeviceInfoTy::HardThreadLimit);
-    }
+  // We are only exploiting threads along the x axis.
+  int maxBlockDimX;
+  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                             cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting max block dimension, use default\n");
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+    DP("Using %d CUDA threads per block\n", maxBlockDimX);
+  } else {
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
+       "at the hard limit\n",
+       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+  }
 
-    // According to the documentation, SIMDWidth is "Warp size in threads".
-    DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
+  int warpSize;
+  err =
+      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting warp size, assume default\n");
+    DeviceInfo.WarpSize[device_id] = 32;
+  } else {
+    DeviceInfo.WarpSize[device_id] = warpSize;
   }
 
   // Adjust teams to the env variables


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D51624.163832.patch
Type: text/x-patch
Size: 4063 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20180904/cace7391/attachment.bin>