[Openmp-commits] [PATCH] D14254: [OpenMP] Initial implementation of OpenMP offloading library - libomptarget device RTLs.
Jonas Hahnfeld via Openmp-commits
openmp-commits at lists.llvm.org
Mon Feb 1 08:01:27 PST 2016
Hahnfeld added inline comments.
================
Comment at: libomptarget/deviceRTLs/nvptx/CMakeLists.txt:59-61
@@ +58,5 @@
+
+ include_directories(
+ src/
+ )
+
----------------
Should not be needed; at least CUDA can correctly find relative paths in `#include "omptarget-nvptx.h"`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/cancel.cu:14
@@ +13,3 @@
+
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+
----------------
I think this can be `#include "omptarget-nvptx.h"` (other files as well)
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:16-17
@@ +15,4 @@
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title,
+ int level);
+
----------------
Move this declaration into `debug.h`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:142
@@ +141,3 @@
+ int rc = 1;
+ PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
+ return rc;
----------------
Typo, should say `omp_get_max_active_levels`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:247-254
@@ +246,10 @@
+ modifier);
+ if (kind >= omp_sched_static && kind < omp_sched_auto) {
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ currTaskDescr->SetRuntimeSched(kind);
+ currTaskDescr->RuntimeChunkSize() = modifier;
+ PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n",
+ (int)currTaskDescr->GetRuntimeSched(),
+ currTaskDescr->RuntimeChunkSize());
+ }
+}
----------------
This makes it impossible to later reset the schedule to this combination - can this happen in theory?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/loop.cu:18-19
@@ +17,4 @@
+
+EXTERN void CompleteCG(omptarget_nvptx_CounterGroup &cg, Counter *priv,
+ Counter n);
+
----------------
No other usage?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu:55-56
@@ +54,4 @@
+ // init global icv
+ omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime =
+ 1.0 / 745000000.0; // host reports 745 mHz
+ omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy =
----------------
Hard-coded?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h:27-31
@@ +26,7 @@
+// local includes
+#include "../../../deviceRTLs/nvptx/src/option.h" // choices we have
+#include "../../../deviceRTLs/nvptx/src/counter_group.h"
+#include "../../../deviceRTLs/nvptx/src/debug.h" // debug
+#include "../../../deviceRTLs/nvptx/src/interface.h" // interfaces with omp, compiler, and user
+#include "../../../deviceRTLs/nvptx/src/support.h"
+
----------------
I think these paths can be reduced as well
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h:26
@@ +25,3 @@
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+ // sched starts from 1..4; encode it as 0..3; so add 1 here
+ uint8_t val = ((uint8_t)sched) - 1;
----------------
Typo: `add` -> `sub`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/option.h:34-40
@@ +33,9 @@
+
+#ifdef OMPTHREAD_IS_WARP
+// assume here one OpenMP thread per CUDA warp
+#define MAX_NUM_OMP_THREADS MAX_NUM_WARPS
+#else
+// assume here one OpenMP thread per CUDA thread
+#define MAX_NUM_OMP_THREADS MAX_NUM_THREADS
+#endif
+
----------------
A feature-switch?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/parallel.cu:155-156
@@ +154,4 @@
+ newTaskDescr->ThreadId() /= numLanes;
+ // newTaskDescr->ThreadsInTeam(); // =
+ // newTaskDescr->ThreadsInTeam()/numLanes;
+ }
----------------
dead code?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:20
@@ +19,3 @@
+
+EXTERN void omp_reduction_op_gpu(char *, char *);
+
----------------
Needed?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:97-116
@@ +96,22 @@
+ kmp_CriticalName *lck) {
+ int globalThreadId = GetGlobalThreadId();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(globalThreadId);
+ int numthread;
+ if (currTaskDescr->IsParallelConstruct()) {
+ numthread = omp_get_num_threads();
+ } else {
+ numthread = omp_get_num_teams();
+ }
+
+ if (numthread == 1)
+ return 1;
+ else if (!__gpu_block_reduce())
+ return 2;
+ else {
+ if (threadIdx.x == 0)
+ return 1;
+ else
+ return 0;
+ }
+}
----------------
Same as `__kmpc_reduce_gpu` (and `__kmpc_reduce_nowait41`)?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:307
@@ +306,3 @@
+
+#define ATOMIC_GENOP_DC(_op) \
+ EXTERN void __kmpc_atomic_cmplx8_##_op(kmp_Indent *id_ref, int32_t gtid, \
----------------
`__kmpc_atomic_cmplx8_##_op##_cpt`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:330-350
@@ +329,23 @@
+
+// implementation with shared
+#define ATOMIC_GENOP_DC_obsolete(_op) \
+ EXTERN void __kmpc_atomic_cmplx16_##_op(kmp_Indent *id_ref, int32_t gtid, \
+ double _Complex *lhs, \
+ double _Complex rhs) { \
+ __shared__ unsigned int stepinblock; \
+ unsigned tnum = __ballot(1); \
+ if (tnum != (~0x0)) { \
+ return; \
+ } \
+ if (threadIdx.x == 0) \
+ stepinblock = 0; \
+ __syncthreads(); \
+ while (stepinblock < blockDim.x) { \
+ if (threadIdx.x == stepinblock) { \
+ dc_##_op(lhs, rhs); \
+ stepinblock++; \
+ } \
+ __syncthreads(); \
+ } \
+ }
+
----------------
Not needed anymore?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:352-355
@@ +351,6 @@
+
+ATOMIC_GENOP_DC(add);
+ATOMIC_GENOP_DC(sub);
+ATOMIC_GENOP_DC(mul);
+ATOMIC_GENOP_DC(div);
+
----------------
`rev` versions for `sub` and `div`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:496
@@ +495,3 @@
+
+// for int and unit
+#define ATOMIC_GENOP_ALL_MIXED(_name, _dirname, _tname, _optype) \
----------------
Typo: `uint`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:516-533
@@ +515,20 @@
+
+#define ATOMIC_GENOP_ALL_MIXED_FIXED8U(_name, _dirname, _tname, _optype) \
+ _dirname(_tname, _optype, add, Add); \
+ _name(_tname, _optype, sub); \
+ _name##_REV(_tname, _optype, sub); \
+ _name(_tname, _optype, mul); \
+ _name(_tname, _optype, div); \
+ _name##_REV(_tname, _optype, div); \
+ _dirname(_tname, _optype, min, Min); \
+ _dirname(_tname, _optype, max, Max); \
+ _dirname(_tname, _optype, andb, And); \
+ _dirname(_tname, _optype, orb, Or); \
+ _dirname(_tname, _optype, xor, Xor); \
+ _name(_tname, _optype, shl); \
+ _name(_tname, _optype, shr); \
+ _name(_tname, _optype, andl); \
+ _name(_tname, _optype, orl); \
+ _name(_tname, _optype, eqv); \
+ _name(_tname, _optype, neqv);
+
----------------
Below, `ATOMIC_GENOP_ALL_MIXED` is also used for `fixed8u` - were there any changes planned on purpose?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:663-666
@@ +662,6 @@
+
+template <>
+INLINE __device__ float Compute<float, omptarget_nvptx_add>(float a, float b) {
+ return a + b;
+}
+
----------------
Could you add a comment explaining why this has to be specialized?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:916-919
@@ +915,6 @@
+
+template <>
+INLINE __device__ int myshfldown<int>(int val, unsigned int delta, int size) {
+ return __shfl_down(val, delta, size);
+}
+
----------------
Is there a need to specialize this template?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:961-971
@@ +960,13 @@
+ switch (binop) {
+ case omptarget_nvptx_inc:
+ case omptarget_nvptx_dec:
+ case omptarget_nvptx_add:
+ case omptarget_nvptx_sub:
+ case omptarget_nvptx_sub_rev:
+ return (T)0;
+ case omptarget_nvptx_mul:
+ case omptarget_nvptx_div:
+ return (T)1;
+ default:
+ return (T)0;
+ }
----------------
Other types? Are they correctly defaulting?
http://reviews.llvm.org/D14254
More information about the Openmp-commits
mailing list