[Openmp-commits] [openmp] 6764301 - [Libomptarget] Correctly implement `getWTime` on AMDGPU

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Tue Jul 4 19:50:52 PDT 2023


Author: Joseph Huber
Date: 2023-07-04T21:50:43-05:00
New Revision: 6764301a6bd3ffe80f3469005cbe5dffd41ba073

URL: https://github.com/llvm/llvm-project/commit/6764301a6bd3ffe80f3469005cbe5dffd41ba073
DIFF: https://github.com/llvm/llvm-project/commit/6764301a6bd3ffe80f3469005cbe5dffd41ba073.diff

LOG: [Libomptarget] Correctly implement `getWTime` on AMDGPU

AMDGPU provides a fixed frequency clock since some generations back.
However, the frequency is variable by card and must be looked up at
runtime. This patch adds a new device environment line for the clock
frequency so that we can use it in the same way as NVPTX. This is the
correct implementation and the version in ASO should be replaced.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D154456

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Configuration.h
    openmp/libomptarget/DeviceRTL/src/Configuration.cpp
    openmp/libomptarget/DeviceRTL/src/Misc.cpp
    openmp/libomptarget/include/DeviceEnvironment.h
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
    openmp/libomptarget/test/offloading/wtime.c

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 09bce1092f0960..068c0166845a74 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -37,6 +37,9 @@ uint32_t getDebugKind();
 /// Return the amount of dynamic shared memory that was allocated at launch.
 uint64_t getDynamicMemorySize();
 
+/// Returns the cycles per second of the device's fixed frequency clock.
+uint64_t getClockFrequency();
+
 /// Return if debugging is enabled for the given debug kind.
 bool isDebugMode(DebugKind Level);
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index ceccef625ed29f..994ff2b67bb34e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -46,6 +46,10 @@ uint64_t config::getDynamicMemorySize() {
   return __omp_rtl_device_environment.DynamicMemSize;
 }
 
+uint64_t config::getClockFrequency() {
+  return __omp_rtl_device_environment.ClockFrequency;
+}
+
 bool config::isDebugMode(config::DebugKind Kind) {
   return config::getDebugKind() & Kind;
 }

diff  --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
index 68ce445a16edf8..a19a263e55b246 100644
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -9,6 +9,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Configuration.h"
 #include "Types.h"
 
 #include "Debug.h"
@@ -27,14 +28,23 @@ double getWTime();
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-double getWTick() { return ((double)1E-9); }
+double getWTick() {
+  // The number of ticks per second for the AMDGPU clock varies by card and can
+  // only be retrived by querying the driver. We rely on the device environment
+  // to inform us what the proper frequency is.
+  return 1.0 / config::getClockFrequency();
+}
 
 double getWTime() {
-  // The intrinsics for measuring time have undocumented frequency
-  // This will probably need to be found by measurement on a number of
-  // architectures. Until then, return 0, which is very inaccurate as a
-  // timer but resolves the undefined symbol at link time.
-  return 0;
+  uint64_t NumTicks = 0;
+  if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl))
+    NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83);
+  else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime))
+    NumTicks = __builtin_amdgcn_s_memrealtime();
+  else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime))
+    NumTicks = __builtin_amdgcn_s_memtime();
+
+  return static_cast<double>(NumTicks) * getWTick();
 }
 
 #pragma omp end declare variant

diff  --git a/openmp/libomptarget/include/DeviceEnvironment.h b/openmp/libomptarget/include/DeviceEnvironment.h
index 231492c68f762c..4260002a1f0361 100644
--- a/openmp/libomptarget/include/DeviceEnvironment.h
+++ b/openmp/libomptarget/include/DeviceEnvironment.h
@@ -20,6 +20,7 @@ struct DeviceEnvironmentTy {
   uint32_t NumDevices;
   uint32_t DeviceNum;
   uint32_t DynamicMemSize;
+  uint64_t ClockFrequency;
 };
 
 #endif

diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 21436baecf9da8..1fcbcf29f9e353 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1596,6 +1596,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       return Err;
     GridValues.GV_Warp_Size = WavefrontSize;
 
+    // Get the frequency of the steady clock.
+    if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
+                                 ClockFrequency))
+      return Err;
+
     // Load the grid values dependending on the wavefront.
     if (WavefrontSize == 32)
       GridValues = getAMDGPUGridValues<32>();
@@ -1757,6 +1762,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// See GenericDeviceTy::getComputeUnitKind().
   std::string getComputeUnitKind() const override { return ComputeUnitKind; }
 
+  /// Returns the clock frequency for the given AMDGPU device.
+  uint64_t getClockFrequency() const override { return ClockFrequency; }
+
   /// Allocate and construct an AMDGPU kernel.
   Expected<GenericKernelTy *>
   constructKernelEntry(const __tgt_offload_entry &KernelEntry,
@@ -2417,6 +2425,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The GPU architecture.
   std::string ComputeUnitKind;
 
+  /// The frequency of the steady clock inside the device.
+  uint64_t ClockFrequency;
+
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
 

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index c86b2eb357936b..9eaaaf817d9f17 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -509,6 +509,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
   // TODO: The device ID used here is not the real device ID used by OpenMP.
   DeviceEnvironment.DeviceNum = DeviceId;
   DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
+  DeviceEnvironment.ClockFrequency = getClockFrequency();
 
   // Create the metainfo of the device environment global.
   GlobalTy DevEnvGlobal("__omp_rtl_device_environment",

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 189406ac1dc2a1..8fe615b2f6f239 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -732,6 +732,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return GridValues.GV_Default_Num_Teams;
   }
   uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
+  virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
 
   /// Get target compute unit kind (e.g., sm_80, or gfx908).
   virtual std::string getComputeUnitKind() const { return "unknown"; }

diff  --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index d823cbebc3d7bf..c165b582f63d0b 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -829,6 +829,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return ComputeCapability.str();
   }
 
+  /// Returns the clock frequency for the given NVPTX device.
+  uint64_t getClockFrequency() const override { return 1000000000; }
+
 private:
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

diff  --git a/openmp/libomptarget/test/offloading/wtime.c b/openmp/libomptarget/test/offloading/wtime.c
index 230d67bced7def..2ba60aadb5bece 100644
--- a/openmp/libomptarget/test/offloading/wtime.c
+++ b/openmp/libomptarget/test/offloading/wtime.c
@@ -1,7 +1,6 @@
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-// UNSUPPORTED: amdgcn-amd-amdhsa
+// RUN: %libomptarget-compileopt-and-run-generic
 
+#include <assert.h>
 #include <omp.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -10,17 +9,17 @@
 
 int main(int argc, char *argv[]) {
   int *data = (int *)malloc(N * sizeof(int));
-#pragma omp target map(from : data[0 : N])
+  double duration = 0.0;
+
+#pragma omp target map(from : data[0 : N]) map(from : duration)
   {
     double start = omp_get_wtime();
     for (int i = 0; i < N; ++i)
       data[i] = i;
     double end = omp_get_wtime();
-    double duration = end - start;
-    printf("duration: %lfs\n", duration);
+    duration = end - start;
   }
+  assert(duration > 0.0);
   free(data);
   return 0;
 }
-
-// CHECK: duration: {{.+[1-9]+}}


        


More information about the Openmp-commits mailing list