[polly] r281611 - GPGPU: Do not assume arrays start at 0

Thu Sep 15 07:05:59 PDT 2016

Author: grosser
Date: Thu Sep 15 09:05:58 2016
New Revision: 281611

URL: http://llvm.org/viewvc/llvm-project?rev=281611&view=rev
Log:
GPGPU: Do not assume arrays start at 0

Our alias checks precisely check that the minimal and maximal accessed elements
do not overlap in a kernel. Hence, we must ensure that our host <-> device
transfers do not touch additional memory locations that are not covered in
the alias check. To ensure this, we make sure that the data we copy for a
given array is only the data from the smallest element accessed to the largest
element accessed.

We also adjust the size of the array according to the offset at which the array
is actually accessed.

An interesting result of this is: In case array are accessed with negative
subscripts ,e.g., A[-100], we automatically allocate and transfer _more_ data to
cover the full array. This is important as such code indeed exists in the wild.

Added:
    polly/trunk/test/GPGPU/non-zero-array-offset.ll
Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=281611&r1=281610&r2=281611&view=diff
==============================================================================

--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Thu Sep 15 09:05:58 2016
@@ -284,6 +284,21 @@ private:
   /// @param Array The array for which to compute a size.
   Value *getArraySize(gpu_array_info *Array);
 
+  /// Generate code to compute the minimal offset at which an array is accessed.
+  ///
+  /// The offset of an array is the minimal array location accessed in a scop.
+  ///
+  /// Example:
+  ///
+  ///   for (long i = 0; i < 100; i++)
+  ///     A[i + 42] += ...
+  ///
+  ///   getArrayOffset(A) results in 42.
+  ///
+  /// @param Array The array for which to compute the offset.
+  /// @returns An llvm::Value that contains the offset of the array.
+  Value *getArrayOffset(gpu_array_info *Array);
+
   /// Prepare the kernel arguments for kernel code generation
   ///
   /// @param Kernel The kernel to generate code for.
@@ -468,6 +483,12 @@ void GPUNodeBuilder::allocateDeviceArray
     DevArrayName.append(Array->name);
 
     Value *ArraySize = getArraySize(Array);
+    Value *Offset = getArrayOffset(Array);
+    if (Offset)
+      ArraySize = Builder.CreateSub(
+          ArraySize,
+          Builder.CreateMul(Offset,
+                            Builder.getInt64(ScopArray->getElemSizeInBytes())));
     Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
     DevArray->setName(DevArrayName);
     DeviceAllocations[ScopArray] = DevArray;
@@ -721,6 +742,48 @@ Value *GPUNodeBuilder::getArraySize(gpu_
   return ArraySize;
 }
 
+Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
+  if (gpu_array_is_scalar(Array))
+    return nullptr;
+
+  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
+
+  isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
+
+  isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
+    ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
+
+  if (isl_set_is_subset(Min, ZeroSet)) {
+    isl_set_free(Min);
+    isl_set_free(ZeroSet);
+    isl_ast_build_free(Build);
+    return nullptr;
+  }
+  isl_set_free(ZeroSet);
+
+  isl_ast_expr *Result =
+      isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
+    if (i > 0) {
+      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
+      isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
+      Result = isl_ast_expr_mul(Result, BExpr);
+    }
+    isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
+    isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
+    Result = isl_ast_expr_add(Result, MExpr);
+  }
+
+  Value *ResultValue = ExprBuilder.create(Result);
+  isl_set_free(Min);
+  isl_ast_build_free(Build);
+
+  return ResultValue;
+}
+
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                         enum DataDirection Direction) {
   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
@@ -730,6 +793,7 @@ void GPUNodeBuilder::createDataTransfer(
   auto ScopArray = (ScopArrayInfo *)(Array->user);
 
   Value *Size = getArraySize(Array);
+  Value *Offset = getArrayOffset(Array);
   Value *DevPtr = DeviceAllocations[ScopArray];
 
   Value *HostPtr;
@@ -739,8 +803,20 @@ void GPUNodeBuilder::createDataTransfer(
   else
     HostPtr = ScopArray->getBasePtr();
 
+  if (Offset) {
+    HostPtr = Builder.CreatePointerCast(
+        HostPtr, ScopArray->getElementType()->getPointerTo());
+    HostPtr = Builder.CreateGEP(HostPtr, Offset);
+  }
+
   HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
 
+  if (Offset) {
+    Size = Builder.CreateSub(
+        Size, Builder.CreateMul(
+                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
+  }
+
   if (Direction == HOST_TO_DEVICE)
     createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
   else
@@ -1000,6 +1076,16 @@ GPUNodeBuilder::createLaunchParameters(p
 
     Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
     DevArray = createCallGetDevicePtr(DevArray);
+
+    Value *Offset = getArrayOffset(&Prog->array[i]);
+
+    if (Offset) {
+      DevArray = Builder.CreatePointerCast(
+          DevArray, SAI->getElementType()->getPointerTo());
+      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
+      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
+    }
+
     Instruction *Param = new AllocaInst(
         Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
         EntryBlock->getTerminator());

Added: polly/trunk/test/GPGPU/non-zero-array-offset.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/non-zero-array-offset.ll?rev=281611&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/non-zero-array-offset.ll (added)
+++ polly/trunk/test/GPGPU/non-zero-array-offset.ll Thu Sep 15 09:05:58 2016
@@ -0,0 +1,124 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+;
+; REQUIRES: pollyacc
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   {
+; CODE-NEXT:     dim3 k0_dimBlock(8);
+; CODE-NEXT:     dim3 k0_dimGrid(1);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   {
+; CODE-NEXT:     dim3 k1_dimBlock(8);
+; CODE-NEXT:     dim3 k1_dimGrid(1);
+; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: Stmt_bb3(t0);
+
+; CODE: # kernel1
+; CODE-NEXT: Stmt_bb11(t0);
+
+; IR:       %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  [[REG0:%.+]] = getelementptr float, float* %B, i64 8
+; IR-NEXT:  [[REG1:%.+]] = bitcast float* [[REG0]] to i8*
+; IR-NEXT:  call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32)
+
+; IR:      [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR-NEXT: [[REGB:%.+]]  = bitcast i8* [[REGA]] to float*
+; IR-NEXT: [[REGC:%.+]]  = getelementptr float, float* [[REGB]], i64 -8
+; IR-NEXT: [[REGD:%.+]]  = bitcast float* [[REGC]] to i8*
+
+;    void foo(float A[], float B[]) {
+;      for (long i = 0; i < 8; i++)
+;        B[i + 8] *= 4;
+;
+;      for (long i = 0; i < 8; i++)
+;        A[i] *= 12;
+;    }
+;
+;    #ifdef OUTPUT
+;    int main() {
+;      float A[16];
+;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        A[i] = i;
+;      }
+;
+;      foo(A, A);
+;
+;      float sum = 0;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        sum += A[i];
+;      }
+;
+;      printf("%f\n", sum);
+;    }
+;    #endif
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B) {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb7, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
+  %exitcond1 = icmp ne i64 %i.0, 8
+  br i1 %exitcond1, label %bb3, label %bb9
+
+bb3:                                              ; preds = %bb2
+  %tmp = add nuw nsw i64 %i.0, 8
+  %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp
+  %tmp5 = load float, float* %tmp4, align 4
+  %tmp6 = fmul float %tmp5, 4.000000e+00
+  store float %tmp6, float* %tmp4, align 4
+  br label %bb7
+
+bb7:                                              ; preds = %bb3
+  %tmp8 = add nuw nsw i64 %i.0, 1
+  br label %bb2
+
+bb9:                                              ; preds = %bb2
+  br label %bb10
+
+bb10:                                             ; preds = %bb15, %bb9
+  %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ]
+  %exitcond = icmp ne i64 %i1.0, 8
+  br i1 %exitcond, label %bb11, label %bb17
+
+bb11:                                             ; preds = %bb10
+  %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
+  %tmp13 = load float, float* %tmp12, align 4
+  %tmp14 = fmul float %tmp13, 1.200000e+01
+  store float %tmp14, float* %tmp12, align 4
+  br label %bb15
+
+bb15:                                             ; preds = %bb11
+  %tmp16 = add nuw nsw i64 %i1.0, 1
+  br label %bb10
+
+bb17:                                             ; preds = %bb10
+  ret void
+}

Modified: polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll?rev=281611&r1=281610&r2=281611&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll (original)
+++ polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll Thu Sep 15 09:05:58 2016
@@ -38,13 +38,13 @@ target triple = "x86_64-unknown-linux-gn
 ; CODE-NEXT:     Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
 ; CODE-NEXT: }
 
-; IR:      %1 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+; IR:      [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
 
-; IR:      %14 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4)
-; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4)
+; IR:      [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
+; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
 
 ; KERNEL-IR: entry:
 ; KERNEL-IR-NEXT:   %out_l.055.s2a = alloca i32