[Openmp-commits] [PATCH] D14254: [OpenMP] Initial implementation of OpenMP offloading library - libomptarget device RTLs.
Jonas Hahnfeld via Openmp-commits
openmp-commits at lists.llvm.org
Mon Feb 1 08:01:27 PST 2016
Hahnfeld added inline comments.
================
Comment at: libomptarget/deviceRTLs/nvptx/CMakeLists.txt:59-61
@@ +58,5 @@
+
+ include_directories(
+ src/
+ )
+
----------------
Should not be needed; at least CUDA can correctly find relative paths in `#include "omptarget-nvptx.h"`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/cancel.cu:14
@@ +13,3 @@
+
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+
----------------
I think this can be `#include "omptarget-nvptx.h"` (other files as well)
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:16-17
@@ +15,4 @@
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title,
+ int level);
+
----------------
Move this declaration into `debug.h`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:142
@@ +141,3 @@
+ int rc = 1;
+ PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
+ return rc;
----------------
Typo, should say `omp_get_max_active_levels`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/libcall.cu:247-254
@@ +246,10 @@
+ modifier);
+ if (kind >= omp_sched_static && kind < omp_sched_auto) {
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ currTaskDescr->SetRuntimeSched(kind);
+ currTaskDescr->RuntimeChunkSize() = modifier;
+ PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n",
+ (int)currTaskDescr->GetRuntimeSched(),
+ currTaskDescr->RuntimeChunkSize());
+ }
+}
----------------
This makes it impossible to later reset the schedule to this combination - can this happen in theory?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/loop.cu:18-19
@@ +17,4 @@
+
+EXTERN void CompleteCG(omptarget_nvptx_CounterGroup &cg, Counter *priv,
+ Counter n);
+
----------------
No other usage?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu:55-56
@@ +54,4 @@
+ // init global icv
+ omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime =
+ 1.0 / 745000000.0; // host reports 745 mHz
+ omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy =
----------------
Hard-coded?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h:27-31
@@ +26,7 @@
+// local includes
+#include "../../../deviceRTLs/nvptx/src/option.h" // choices we have
+#include "../../../deviceRTLs/nvptx/src/counter_group.h"
+#include "../../../deviceRTLs/nvptx/src/debug.h" // debug
+#include "../../../deviceRTLs/nvptx/src/interface.h" // interfaces with omp, compiler, and user
+#include "../../../deviceRTLs/nvptx/src/support.h"
+
----------------
I think these paths can be reduced as well
================
Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h:26
@@ +25,3 @@
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+ // sched starts from 1..4; encode it as 0..3; so add 1 here
+ uint8_t val = ((uint8_t)sched) - 1;
----------------
Typo: `add` -> `sub`
================
Comment at: libomptarget/deviceRTLs/nvptx/src/option.h:34-40
@@ +33,9 @@
+
+#ifdef OMPTHREAD_IS_WARP
+// assume here one OpenMP thread per CUDA warp
+#define MAX_NUM_OMP_THREADS MAX_NUM_WARPS
+#else
+// assume here one OpenMP thread per CUDA thread
+#define MAX_NUM_OMP_THREADS MAX_NUM_THREADS
+#endif
+
----------------
A feature-switch?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/parallel.cu:155-156
@@ +154,4 @@
+ newTaskDescr->ThreadId() /= numLanes;
+ // newTaskDescr->ThreadsInTeam(); // =
+ // newTaskDescr->ThreadsInTeam()/numLanes;
+ }
----------------
dead code?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:20
@@ +19,3 @@
+
+EXTERN void omp_reduction_op_gpu(char *, char *);
+
----------------
Needed?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:97-116
@@ +96,22 @@
+ kmp_CriticalName *lck) {
+ int globalThreadId = GetGlobalThreadId();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(globalThreadId);
+ int numthread;
+ if (currTaskDescr->IsParallelConstruct()) {
+ numthread = omp_get_num_threads();
+ } else {
+ numthread = omp_get_num_teams();
+ }
+
+ if (numthread == 1)
+ return 1;
+ else if (!__gpu_block_reduce())
+ return 2;
+ else {
+ if (threadIdx.x == 0)
+ return 1;
+ else
+ return 0;
+ }
+}
----------------
Same as `__kmpc_reduce_gpu` (and `__kmpc_reduce_nowait41`)?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:307
@@ +306,3 @@
+
+#define ATOMIC_GENOP_DC(_op) \
+ EXTERN void __kmpc_atomic_cmplx8_##_op(kmp_Indent *id_ref, int32_t gtid, \
----------------
`__kmpc_atomic_cmplx8_##_op##_cpt`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:330-350
@@ +329,23 @@
+
+// implementation with shared
+#define ATOMIC_GENOP_DC_obsolete(_op) \
+ EXTERN void __kmpc_atomic_cmplx16_##_op(kmp_Indent *id_ref, int32_t gtid, \
+ double _Complex *lhs, \
+ double _Complex rhs) { \
+ __shared__ unsigned int stepinblock; \
+ unsigned tnum = __ballot(1); \
+ if (tnum != (~0x0)) { \
+ return; \
+ } \
+ if (threadIdx.x == 0) \
+ stepinblock = 0; \
+ __syncthreads(); \
+ while (stepinblock < blockDim.x) { \
+ if (threadIdx.x == stepinblock) { \
+ dc_##_op(lhs, rhs); \
+ stepinblock++; \
+ } \
+ __syncthreads(); \
+ } \
+ }
+
----------------
Not needed anymore?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:352-355
@@ +351,6 @@
+
+ATOMIC_GENOP_DC(add);
+ATOMIC_GENOP_DC(sub);
+ATOMIC_GENOP_DC(mul);
+ATOMIC_GENOP_DC(div);
+
----------------
`rev` versions for `sub` and `div`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:496
@@ +495,3 @@
+
+// for int and unit
+#define ATOMIC_GENOP_ALL_MIXED(_name, _dirname, _tname, _optype) \
----------------
Typo: `uint`?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:516-533
@@ +515,20 @@
+
+#define ATOMIC_GENOP_ALL_MIXED_FIXED8U(_name, _dirname, _tname, _optype) \
+ _dirname(_tname, _optype, add, Add); \
+ _name(_tname, _optype, sub); \
+ _name##_REV(_tname, _optype, sub); \
+ _name(_tname, _optype, mul); \
+ _name(_tname, _optype, div); \
+ _name##_REV(_tname, _optype, div); \
+ _dirname(_tname, _optype, min, Min); \
+ _dirname(_tname, _optype, max, Max); \
+ _dirname(_tname, _optype, andb, And); \
+ _dirname(_tname, _optype, orb, Or); \
+ _dirname(_tname, _optype, xor, Xor); \
+ _name(_tname, _optype, shl); \
+ _name(_tname, _optype, shr); \
+ _name(_tname, _optype, andl); \
+ _name(_tname, _optype, orl); \
+ _name(_tname, _optype, eqv); \
+ _name(_tname, _optype, neqv);
+
----------------
Below, `ATOMIC_GENOP_ALL_MIXED` is also used for `fixed8u` - were there any changes planned on purpose?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:663-666
@@ +662,6 @@
+
+template <>
+INLINE __device__ float Compute<float, omptarget_nvptx_add>(float a, float b) {
+ return a + b;
+}
+
----------------
Could you add a comment explaining why this has to be specialized?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:916-919
@@ +915,6 @@
+
+template <>
+INLINE __device__ int myshfldown<int>(int val, unsigned int delta, int size) {
+ return __shfl_down(val, delta, size);
+}
+
----------------
Is there a need to specialize this template?
================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:961-971
@@ +960,13 @@
+ switch (binop) {
+ case omptarget_nvptx_inc:
+ case omptarget_nvptx_dec:
+ case omptarget_nvptx_add:
+ case omptarget_nvptx_sub:
+ case omptarget_nvptx_sub_rev:
+ return (T)0;
+ case omptarget_nvptx_mul:
+ case omptarget_nvptx_div:
+ return (T)1;
+ default:
+ return (T)0;
+ }
----------------
Other types? Are they correctly defaulting?
http://reviews.llvm.org/D14254
More information about the Openmp-commits
mailing list