[Openmp-commits] [openmp] [OpenMP][NFC] Simplify rounding operations (PR #196155)

Wed May 6 12:50:27 PDT 2026

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/196155

>From 32f6ea9c07fe142847d33fe611423a4fa0822372 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 6 May 2026 14:45:19 -0500
Subject: [PATCH] [OpenMP][NFC] Simplify rounding operations

Summary:
Many call sites open-coded rounding up / down (ceiling division and
power-of-two alignment). Add helpers for these and simplify the callers.
---
 openmp/device/include/DeviceUtils.h | 12 +++++++++++-
 openmp/device/src/Mapping.cpp       |  8 ++++----
 openmp/device/src/Parallelism.cpp   |  2 +-
 openmp/device/src/Reduction.cpp     | 12 +++++-------
 openmp/device/src/Workshare.cpp     |  9 ++++-----
 5 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/openmp/device/include/DeviceUtils.h b/openmp/device/include/DeviceUtils.h
index 876cf666f8610..9786b4e8aaaf9 100644
--- a/openmp/device/include/DeviceUtils.h
+++ b/openmp/device/include/DeviceUtils.h
@@ -69,10 +69,20 @@ template <typename T> inline int ctz(T V) { return __builtin_ctzg(V); }
 template <typename T> inline int popc(T V) { return __builtin_popcountg(V); }
 
 /// Return \p V aligned up to the nearest power of two multiple of \p A.
-template <typename T, typename U> inline int alignUp(T V, U A) {
+template <typename T, typename U> inline T alignUp(T V, U A) {
   return __builtin_align_up(V, A);
 }
 
+/// Return \p V aligned down to a multiple of \p A (must be a power of two).
+template <typename T, typename U> inline T alignDown(T V, U A) {
+  return __builtin_align_down(V, A);
+}
+
+/// Return \p X divided by \p Y, rounded up (a quotient, not an aligned value).
+template <typename T, typename U> inline T roundUp(T X, U Y) {
+  return (X + Y - 1) / Y;
+}
+
 /// Return \p Ptr advanced by \p Offset bytes.
 template <typename T, typename U> T *advancePtr(T *Ptr, U Offset) {
   return reinterpret_cast<T *>(reinterpret_cast<char *>(Ptr) + Offset);
diff --git a/openmp/device/src/Mapping.cpp b/openmp/device/src/Mapping.cpp
index b145892d1ece0..5c51da65c8d28 100644
--- a/openmp/device/src/Mapping.cpp
+++ b/openmp/device/src/Mapping.cpp
@@ -27,8 +27,8 @@ extern const inline uint32_t __oclc_ABI_version = 500;
 #endif
 
 static bool isInLastWarp() {
-  uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
-                     ~(mapping::getWarpSize() - 1);
+  uint32_t MainTId = utils::alignDown(mapping::getNumberOfThreadsInBlock() - 1,
+                                      mapping::getWarpSize());
   return mapping::getThreadIdInBlock() == MainTId;
 }
 
@@ -131,8 +131,8 @@ uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
 }
 
 uint32_t mapping::getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
+  return utils::roundUp(mapping::getNumberOfThreadsInBlock(),
+                        mapping::getWarpSize());
 }
 
 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index 9f74990ce43ea..0902cc3f161b9 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -61,7 +61,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   if (NumThreads < mapping::getWarpSize())
     NumThreads = 1;
   else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+    NumThreads = utils::alignDown(NumThreads, mapping::getWarpSize());
 
   return NumThreads;
 }
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..271d31629d8ee 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -84,8 +84,7 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
 
 #if __has_builtin(__nvvm_reflect)
   if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpsNeeded = utils::roundUp(NumThreads, mapping::getWarpSize());
     uint32_t WarpId = mapping::getWarpIdInBlock();
 
     // Volta execution model:
@@ -136,8 +135,7 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   //
   // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpsNeeded = utils::roundUp(NumThreads, mapping::getWarpSize());
     // Gather all the reduced values from each warp
     // to the first warp.
     cpyFct(reduce_data, WarpsNeeded);
@@ -158,7 +156,7 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
 uint32_t roundToWarpsize(uint32_t s) {
   if (s < mapping::getWarpSize())
     return 1;
-  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
+  return utils::alignDown(s, mapping::getWarpSize());
 }
 
 uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
@@ -281,8 +279,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
       // a block reduction is performed here.
       uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
       if (ActiveThreads > mapping::getWarpSize()) {
-        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
-                               mapping::getWarpSize();
+        uint32_t WarpsNeeded =
+            utils::roundUp(ActiveThreads, mapping::getWarpSize());
         // Gather all the reduced values from each warp
         // to the first warp.
         cpyFct(reduce_data, WarpsNeeded);
diff --git a/openmp/device/src/Workshare.cpp b/openmp/device/src/Workshare.cpp
index dd91e95754612..6e6440b690db0 100644
--- a/openmp/device/src/Workshare.cpp
+++ b/openmp/device/src/Workshare.cpp
@@ -144,10 +144,9 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
       if (chunk > 0) {
         // round up to make sure the chunk is enough to cover all iterations
         T tripCount = ub - lb + 1; // +1 because ub is inclusive
-        T span = (tripCount + numberOfActiveOMPThreads - 1) /
-                 numberOfActiveOMPThreads;
+        T span = utils::roundUp(tripCount, numberOfActiveOMPThreads);
         // perform chunk adjustment
-        chunk = (span + chunk - 1) & ~(chunk - 1);
+        chunk = utils::alignUp(span, chunk);
 
         ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
         T oldUb = ub;
@@ -290,9 +289,9 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
       ST stride;
       int lastiter = 0;
       // round up to make sure the chunk is enough to cover all iterations
-      T span = (tripCount + tnum - 1) / tnum;
+      T span = utils::roundUp(tripCount, tnum);
       // perform chunk adjustment
-      chunk = (span + chunk - 1) & ~(chunk - 1);
+      chunk = utils::alignUp(span, chunk);
 
       T oldUb = ub;
       ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);