[Openmp-commits] [PATCH] D14031: [OpenMP] Initial implementation of OpenMP offloading library - libomptarget.

Mon Oct 26 07:57:35 PDT 2015

Hahnfeld added a comment.

Great work!
Would it be possible to split this into multiple smaller reviews (for the main library, the plugins, and the device runtime for nvptx)?

Greetings
Jonas

================
Comment at: libomptarget/deviceRTLs/nvptx/src/parallel.cu:151-158
@@ +150,10 @@
+  }
+//  } else {
+//    // not a for with a simd inside: use only one lane
+//    // we may have started thread_limit*simd_info CUDA threads
+//    // and we need to set the number of threads to thread_limit value
+//    // FIXME: is this always the case, even if numLanes > 1?
+////    newTaskDescr->ThreadId() = threadIdx.x;
+//    //newTaskDescr->ThreadsInTeam();// = newTaskDescr->ThreadLimit();
+//  }
+}
----------------
Commented out - should this be committed?

================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:71-117
@@ +70,49 @@
+	}
+//	return 2;
+	/**
+	 * Only when all the threads in a block are doing reduction,
+	 * the warpBlockRedu is used. Otherwise atomic.
+	 * check the data type, too.
+	 * A special case: when the size of thread group is one,
+	 * do reduction directly.
+	 **/
+	
+	// Note: this code provokes warning because it follows a "return"
+	
+	//since there is no thread interface yet, just infer from the 
+	// result of ballot
+#if 0
+	unsigned tnum = __ballot(1);
+	if (tnum != (~0x0)) { //assume swapSize is 32
+		return 2;
+	}
+	
+#if 0
+	if (threadIdx.x == 0) {
+		if ((void *)reductFct != (void *)omp_reduction_op_gpu) {
+			printf("function pointer value is not correct\n");
+		} else {
+			printf("function pointer value is correct\n");
+		}
+	}
+#endif
+
+	//printf("function pointer %p %d %p\n", reductFct, reduce_size, omp_reduction_op_gpu);
+	if (reduce_size == 0) {
+	(*reductFct)((char*)reduce_data, (char*)reduce_data);
+	} else {
+	//omp_reduction_op_gpu((char*)reduce_data, (char*)reduce_data);
+	 (*gpu_callback)((char*)reduce_data, (char*)reduce_data);
+	}
+	
+	//int **myp = (int **) reduce_data;
+	// the results are with thread 0. Reduce to the shared one
+	if (threadIdx.x == 0) {
+		//printf("function pointer %p %p\n", reductFct, omp_reduction_op);
+	 //   	printf("my result %d\n", *myp[0]);
+		return 1;
+	} else {
+		return 0;
+	}
+#endif
+}
----------------
Commented out or disabled with #if 0 - should this be committed?

================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:157-178
@@ +156,24 @@
+	// Notice: as above, uncomment if 0 once this code below is ready for shipping
+#if 0
+	unsigned tnum = __ballot(1);
+	if (tnum != (~0x0)) { //assume swapSize is 32
+		return 2;
+	}
+
+	if (threadIdx.x == 0) {
+		printf("choose block reduction\n");
+	}
+
+	(*reductFct)(reduce_data, reduce_data);
+	//omp_reduction_op((char*)reduce_data, (char*)reduce_data);
+
+	int **myp = (int **) reduce_data;
+	// the results are with thread 0. Reduce to the shared one
+	if (threadIdx.x == 0) {
+	    	printf("my result %d\n", *myp[0]);
+		return 1;
+	} else {
+		return 0;
+	}
+#endif
+}
----------------
Not ready for shipping?

================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:780-795
@@ +779,18 @@
+
+#if 0
+template <
+  omptarget_nvptx_BINOP_t binop          // enum describing the operation
+>
+INLINE __device__ float Compute<float, binop>(float a, float b) // a is old value, b is new value
+{
+  OpType res = 0;
+  if (binop == omptarget_nvptx_add)     res = a + b;
+  if (binop == omptarget_nvptx_sub)     res = a - b;
+  if (binop == omptarget_nvptx_mul)     res = a * b;
+  if (binop == omptarget_nvptx_div)     res = a / b;
+  if (binop == omptarget_nvptx_min)     res = a < b ? a : b;
+  if (binop == omptarget_nvptx_max)     res = a > b ? a : b;
+  return res;
+}
+#endif
+
----------------
This block is disabled with #if 0 - should it be committed?

================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:1001-1028
@@ +1000,30 @@
+	return __shfl_down(val, delta, size);
+#if 0
+	T ret = 0;
+	int localv;
+	int remotev;
+	/* not finished */
+	switch(sizeof(T)) {
+	case 1:
+	case 2:
+		localv = reinterpret_cast<int>(val);
+		remotev = __shfl_down(localv, delta, size);
+		ret = reinterpret_cast<T>(remotev);
+		break;
+	break;
+	
+	}
+	return ret;
+#endif
+}
+
+#if 0
+template<>
+INLINE __device__ float myshfldown<float>(float val, unsigned int delta, int size) {
+	int t = __float_as_int(val);
+	int t1 = __shfl_down(t, delta, size);
+	float ret = __int_as_float<float>(t1);
+	return ret;
+}
+#endif
+
----------------
Some more code disabled with #if 0.

================
Comment at: libomptarget/deviceRTLs/nvptx/src/reduction.cu:1239-1246
@@ +1238,10 @@
+
+#if 1
+ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed1, int8_t);
+ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed2, int16_t);
+ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed4, int32_t);
+ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed8, int64_t);
+ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float4, float);
+ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float8, double);
+#endif
+
----------------
Is this #if 1 guard needed?

http://reviews.llvm.org/D14031