[Openmp-commits] [openmp] [Libomptarget] Remove remaining inline assembly from the device RTL (PR #79922)

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Mon Jan 29 16:49:45 PST 2024


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/79922

Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that was not a direct replacement with a built-in is the `pack` and
`unpack` implementations. However, the generic C implementation compiles
to equivalent SASS when run through PTXAS.


>From add98f3f4447af88bf41e9165c6d2a91fc27cc31 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 29 Jan 2024 18:46:48 -0600
Subject: [PATCH] [Libomptarget] Remove remaining inline assembly from the
 device RTL

Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that was not a direct replacement with a built-in is the `pack` and
`unpack` implementations. However, the generic C implementation compiles
to equivalent SASS when run through PTXAS.
---
 openmp/libomptarget/DeviceRTL/CMakeLists.txt  |  2 +-
 openmp/libomptarget/DeviceRTL/src/Mapping.cpp | 18 ++-------
 openmp/libomptarget/DeviceRTL/src/Misc.cpp    |  5 +--
 .../DeviceRTL/src/Synchronization.cpp         |  7 +---
 openmp/libomptarget/DeviceRTL/src/Utils.cpp   | 39 +------------------
 5 files changed, 10 insertions(+), 61 deletions(-)

diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e40a80a..2509f1276ccee 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -293,7 +293,7 @@ foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
     compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
+    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
   else()
     libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
   endif()
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 822b8dc2dd5e6..31dd8054dec33 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -154,23 +154,11 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
 
 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
 
-LaneMaskTy activemask() {
-  unsigned int Mask;
-  asm("activemask.b32 %0;" : "=r"(Mask));
-  return Mask;
-}
+LaneMaskTy activemask() { return __nvvm_activemask(); }
 
-LaneMaskTy lanemaskLT() {
-  __kmpc_impl_lanemask_t Res;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
-  return Res;
-}
+LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
 
-LaneMaskTy lanemaskGT() {
-  __kmpc_impl_lanemask_t Res;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
-  return Res;
-}
+LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
 
 uint32_t getThreadIdInBlock(int32_t Dim) {
   switch (Dim) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
index 87d568779b401..c24af9442d16e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -62,9 +62,8 @@ double getWTick() {
 }
 
 double getWTime() {
-  unsigned long long nsecs;
-  asm volatile("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
-  return (double)nsecs * getWTick();
+  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
+  return static_cast<double>(nsecs) * getWTick();
 }
 
 #pragma omp end declare variant
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 57f98a353589c..80ba87b300bcd 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -345,10 +345,7 @@ void namedBarrier() {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.
   constexpr int BarrierNo = 7;
-  asm volatile("barrier.sync %0, %1;"
-               :
-               : "r"(BarrierNo), "r"(NumThreads)
-               : "memory");
+  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
 }
 
 void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
@@ -361,7 +358,7 @@ void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
 
 void syncThreads(atomic::OrderingTy Ordering) {
   constexpr int BarrierNo = 8;
-  asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+  __nvvm_barrier_sync(BarrierNo);
 }
 
 void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index 7da4da4ab95e2..d07ac0fb499c9 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -22,52 +22,17 @@ using namespace ompx;
 namespace impl {
 
 bool isSharedMemPtr(const void *Ptr) { return false; }
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
   return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  uint32_t LowBitsLocal, HighBitsLocal;
-  asm("mov.b64 {%0,%1}, %2;"
-      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
-      : "l"(Val));
-  *LowBits = LowBitsLocal;
-  *HighBits = HighBitsLocal;
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  uint64_t Val;
-  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
-  return Val;
-}
-
-#pragma omp end declare variant
-///}
-
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);



More information about the Openmp-commits mailing list