[Openmp-commits] [llvm] [openmp] [Offload] Clean up and split Shared/Utils.h (PR #194876)

Wed Apr 29 07:51:53 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
This is used in the GPU portion of the build, but really shouldn't be.
Most of these helpers were only used by the GPU portion, which has much
nicer builtins available in new clang. Using these in the main offload
build is broken on Windows and not available on older compilers, so we
split. Also means one less header we share.

The alignment helpers are trivially replaced by an LLVM utility; the
only things that remain are the pointer arithmetic functions. There's no
convenient place to put these so I just let them stay.


---
Full diff: https://github.com/llvm/llvm-project/pull/194876.diff


10 Files Affected:

- (modified) offload/include/Shared/Utils.h (-34) 
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+2-2) 
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+1-1) 
- (modified) openmp/device/include/DeviceUtils.h (+16-1) 
- (modified) openmp/device/src/Allocator.cpp (+1-1) 
- (modified) openmp/device/src/DeviceUtils.cpp (-3) 
- (modified) openmp/device/src/Reduction.cpp (+2-2) 
- (modified) openmp/device/src/Synchronization.cpp (+1-1) 
- (modified) openmp/device/src/Tasking.cpp (+1-1) 
- (modified) openmp/device/src/Workshare.cpp (+1-1) 


``````````diff

diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index fa0212e2c2175..19a92fc86588c 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -30,40 +30,6 @@ template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
   return (Ty1 *)(const_cast<char *>((const char *)(Ptr)) + Offset);
 }
 
-/// Return \p V aligned "upwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 alignPtr(Ty1 V, Ty2 Align) {
-  return reinterpret_cast<Ty1>(((uintptr_t(V) + Align - 1) / Align) * Align);
-}
-
-/// Round up \p V to a \p Boundary.
-template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return alignPtr(V, Boundary);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_ffs(V);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_ffsl(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_popcount(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_popcountl(V);
-}
-
 } // namespace utils
 
 #endif // OMPTARGET_SHARED_UTILS_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b98048c899332..f4e81025a285d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2344,7 +2344,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     // Transparently round up to a multiple of the page size.
     auto *Pool = CoarseGrainedMemoryPools[0];
-    Size = utils::roundUp(Size, (uint64_t)Pool->getGranule());
+    Size = llvm::alignTo(Size, (uint64_t)Pool->getGranule());
 
     // Reserve the virtual address range.
     hsa_status_t Status =
@@ -4156,7 +4156,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  uint64_t ImplArgsOffset = utils::roundUp(
+  uint64_t ImplArgsOffset = llvm::alignTo(
       LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
   if (ArgsSize > ImplArgsOffset) {
     hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 42291e4e6b4df..05fdcb032bd29 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -684,7 +684,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
                            "wrong device page size");
 
     // Transparently round up to a multiple of the page size.
-    Size = utils::roundUp(Size, Granularity);
+    Size = llvm::alignTo(Size, Granularity);
 
     // Reserve the virtual address range.
     CUdeviceptr DevPtr = 0;
diff --git a/openmp/device/include/DeviceUtils.h b/openmp/device/include/DeviceUtils.h
index b92514ee9838a..876cf666f8610 100644
--- a/openmp/device/include/DeviceUtils.h
+++ b/openmp/device/include/DeviceUtils.h
@@ -13,7 +13,6 @@
 #define OMPTARGET_DEVICERTL_DEVICE_UTILS_H
 
 #include "DeviceTypes.h"
-#include "Shared/Utils.h"
 
 namespace utils {
 
@@ -63,6 +62,22 @@ template <typename To, typename From> inline To bitCast(From V) {
   return __builtin_bit_cast(To, V);
 }
 
+/// Return the first bit set in \p V.
+template <typename T> inline int ctz(T V) { return __builtin_ctzg(V); }
+
+/// Return the number of bits set in \p V.
+template <typename T> inline int popc(T V) { return __builtin_popcountg(V); }
+
+/// Return \p V aligned up to the nearest power of two multiple of \p A.
+template <typename T, typename U> inline int alignUp(T V, U A) {
+  return __builtin_align_up(V, A);
+}
+
+/// Return \p Ptr advanced by \p Offset bytes.
+template <typename T, typename U> T *advancePtr(T *Ptr, U Offset) {
+  return reinterpret_cast<T *>(reinterpret_cast<char *>(Ptr) + Offset);
+}
+
 /// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
 /// is identified by \p Mask.
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 3782478932046..2a98e81d268f8 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -41,7 +41,7 @@ struct BumpAllocatorTy final {
   uint64_t Offset = 0;
 
   void *alloc(uint64_t Size) {
-    Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
+    Size = utils::alignUp(Size, uint64_t(allocator::ALIGNMENT));
 
     uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
     if (OldData + Size >= MEMORY_SIZE)
diff --git a/openmp/device/src/DeviceUtils.cpp b/openmp/device/src/DeviceUtils.cpp
index d6f8c499c8904..a7ae25e49d21f 100644
--- a/openmp/device/src/DeviceUtils.cpp
+++ b/openmp/device/src/DeviceUtils.cpp
@@ -5,9 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
 
 #include "DeviceUtils.h"
 
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index fffd0063940c6..f2a2d5e39aaa5 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -52,11 +52,11 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
   do {
     Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
+    remote_id = utils::ctz(Liveness & lanemask_gt);
     size = utils::popc(Liveness);
     logical_lane_id /= 2;
     shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+            /*Offset=*/remote_id - physical_lane_id, /*AlgoVersion=*/2);
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 93f10ebd37292..d1b772becab41 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -107,7 +107,7 @@ void unsetCriticalLock(omp_lock_t *Lock) {
 }
 
 void setCriticalLock(omp_lock_t *Lock) {
-  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
+  uint64_t LowestActiveThread = utils::ctz(mapping::activemask());
   if (mapping::getThreadIdInWarp() == LowestActiveThread) {
     fence::kernel(atomic::release);
     while (
diff --git a/openmp/device/src/Tasking.cpp b/openmp/device/src/Tasking.cpp
index d0be0ace50dff..bd705b3d5258b 100644
--- a/openmp/device/src/Tasking.cpp
+++ b/openmp/device/src/Tasking.cpp
@@ -27,7 +27,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
                                         size_t SharedValuesSize,
                                         TaskFnTy TaskFn) {
   auto TaskSizeInclPrivateValuesPadded =
-      utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
+      utils::alignUp(TaskSizeInclPrivateValues, sizeof(void *));
   auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
   TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
       TaskSizeTotal, "explicit task descriptor");
diff --git a/openmp/device/src/Workshare.cpp b/openmp/device/src/Workshare.cpp
index 653104ce883d1..dd91e95754612 100644
--- a/openmp/device/src/Workshare.cpp
+++ b/openmp/device/src/Workshare.cpp
@@ -339,7 +339,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
 
   static uint64_t NextIter() {
     __kmpc_impl_lanemask_t active = mapping::activemask();
-    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t leader = utils::ctz(active);
     uint32_t change = utils::popc(active);
     __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
     unsigned int rank = utils::popc(active & lane_mask_lt);

``````````

</details>


https://github.com/llvm/llvm-project/pull/194876