[Openmp-commits] [openmp] [Libomptarget] Remove remaining inline assembly from the device RTL (PR #79922)
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jan 29 16:49:45 PST 2024
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/79922
Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that is not a direct replacement with a built-in is the `Pack` and
`Unpack` implementations. However, the generic C implementation produces
equivalent SASS when run through PTXAS.
>From add98f3f4447af88bf41e9165c6d2a91fc27cc31 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 29 Jan 2024 18:46:48 -0600
Subject: [PATCH] [Libomptarget] Remove remaining inline assembly from the
device RTL
Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that is not a direct replacement with a built-in is the `Pack` and
`Unpack` implementations. However, the generic C implementation produces
equivalent SASS when run through PTXAS.
---
openmp/libomptarget/DeviceRTL/CMakeLists.txt | 2 +-
openmp/libomptarget/DeviceRTL/src/Mapping.cpp | 18 ++-------
openmp/libomptarget/DeviceRTL/src/Misc.cpp | 5 +--
.../DeviceRTL/src/Synchronization.cpp | 7 +---
openmp/libomptarget/DeviceRTL/src/Utils.cpp | 39 +------------------
5 files changed, 10 insertions(+), 61 deletions(-)
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e40a80a..2509f1276ccee 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -293,7 +293,7 @@ foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
- compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
+ compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
else()
libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
endif()
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 822b8dc2dd5e6..31dd8054dec33 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -154,23 +154,11 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
-LaneMaskTy activemask() {
- unsigned int Mask;
- asm("activemask.b32 %0;" : "=r"(Mask));
- return Mask;
-}
+LaneMaskTy activemask() { return __nvvm_activemask(); }
-LaneMaskTy lanemaskLT() {
- __kmpc_impl_lanemask_t Res;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
- return Res;
-}
+LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-LaneMaskTy lanemaskGT() {
- __kmpc_impl_lanemask_t Res;
- asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
- return Res;
-}
+LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
index 87d568779b401..c24af9442d16e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -62,9 +62,8 @@ double getWTick() {
}
double getWTime() {
- unsigned long long nsecs;
- asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
- return (double)nsecs * getWTick();
+ uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
+ return static_cast<double>(nsecs) * getWTick();
}
#pragma omp end declare variant
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 57f98a353589c..80ba87b300bcd 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -345,10 +345,7 @@ void namedBarrier() {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
constexpr int BarrierNo = 7;
- asm volatile("barrier.sync %0, %1;"
- :
- : "r"(BarrierNo), "r"(NumThreads)
- : "memory");
+ __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}
void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
@@ -361,7 +358,7 @@ void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
void syncThreads(atomic::OrderingTy Ordering) {
constexpr int BarrierNo = 8;
- asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+ __nvvm_barrier_sync(BarrierNo);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index 7da4da4ab95e2..d07ac0fb499c9 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -22,52 +22,17 @@ using namespace ompx;
namespace impl {
bool isSharedMemPtr(const void *Ptr) { return false; }
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
- *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
- *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+ *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+ *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- uint32_t LowBitsLocal, HighBitsLocal;
- asm("mov.b64 {%0,%1}, %2;"
- : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
- : "l"(Val));
- *LowBits = LowBitsLocal;
- *HighBits = HighBitsLocal;
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
- uint64_t Val;
- asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
- return Val;
-}
-
-#pragma omp end declare variant
-///}
-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
More information about the Openmp-commits
mailing list