[polly] r277802 - GPGPU: Sort dimension sizes of multi-dimensional shared memory arrays correctly
Tobias Grosser via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 5 01:27:24 PDT 2016
Author: grosser
Date: Fri Aug 5 03:27:24 2016
New Revision: 277802
URL: http://llvm.org/viewvc/llvm-project?rev=277802&view=rev
Log:
GPGPU: Sort dimension sizes of multi-dimensional shared memory arrays correctly
Before this commit we generated the array type with its dimensions in reverse
order, and we also added the outermost dimension size to the new array
declaration. The latter is incorrect, as Polly assumes an additional unsized
outermost dimension, so including the outermost size introduced an off-by-one
error in the linearization of access expressions.
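For the new test case below (float b[][8], so the outermost dimension is
unsized), only the inner dimension contributes a stride, and an access
b[j][k] linearizes to the flat offset j * 8 + k. A minimal sketch of that
computation (linearizeB is a hypothetical helper, not part of this commit):

  // Linearization for float b[][8]: the unsized outermost dimension adds
  // no stride, so only the inner size (8) enters the computation. This
  // mirrors the %polly.access.mul.MemRef_b / %polly.access.add.MemRef_b
  // instructions checked by the KERNEL test below.
  long linearizeB(long j, long k) { return j * 8 + k; }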
Added:
polly/trunk/test/GPGPU/shared-memory-two-dimensional.ll
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=277802&r1=277801&r2=277802&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Aug 5 03:27:24 2016
@@ -1292,11 +1292,17 @@ void GPUNodeBuilder::createKernelVariabl
     Type *ArrayTy = EleTy;
     SmallVector<const SCEV *, 4> Sizes;
-    for (unsigned int j = 0; j < Var.array->n_index; ++j) {
+    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
       isl_val *Val = isl_vec_get_element_val(Var.size, j);
       long Bound = isl_val_get_num_si(Val);
       isl_val_free(Val);
       Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
+    }
+
+    for (int j = Var.array->n_index - 1; j >= 0; --j) {
+      isl_val *Val = isl_vec_get_element_val(Var.size, j);
+      long Bound = isl_val_get_num_si(Val);
+      isl_val_free(Val);
       ArrayTy = ArrayType::get(ArrayTy, Bound);
     }
Added: polly/trunk/test/GPGPU/shared-memory-two-dimensional.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/shared-memory-two-dimensional.ll?rev=277802&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/shared-memory-two-dimensional.ll (added)
+++ polly/trunk/test/GPGPU/shared-memory-two-dimensional.ll Fri Aug 5 03:27:24 2016
@@ -0,0 +1,103 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN:     -polly-acc-use-shared \
+; RUN:     -disable-output < %s | \
+; RUN:     FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN:     -polly-acc-use-shared \
+; RUN:     -disable-output -polly-acc-dump-kernel-ir < %s | \
+; RUN:     FileCheck -check-prefix=KERNEL %s
+
+; REQUIRES: pollyacc
+
+; void foo(float A[], float b[][8]) {
+;   for (long i = 0; i < 32; i++)
+;     for (long j = 0; j < 16; j++)
+;       for (long k = 0; k < 8; k++)
+;         A[i] += j * k * b[j][k];
+; }
+
+
+; CODE: # kernel0
+; CODE-NEXT: {
+; CODE-NEXT:   if (t0 <= 7)
+; CODE-NEXT:     for (int c0 = 0; c0 <= 15; c0 += 1)
+; CODE-NEXT:       read(c0, t0);
+; CODE-NEXT:   read(t0);
+; CODE-NEXT:   sync0();
+; CODE-NEXT:   for (int c3 = 0; c3 <= 15; c3 += 1)
+; CODE-NEXT:     for (int c4 = 0; c4 <= 7; c4 += 1)
+; CODE-NEXT:       Stmt_bb8(t0, c3, c4);
+; CODE-NEXT:   sync1();
+; CODE-NEXT:   write(t0);
+; CODE-NEXT: }
+
+; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4
+
+; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8
+; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0
+; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b
+; KERNEL-NEXT: %shared.read = load float, float* %polly.access.MemRef_b
+; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, [8 x float]* %b) {
+bb:
+  br label %bb3
+
+bb3:                                      ; preds = %bb22, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ]
+  %exitcond2 = icmp ne i64 %i.0, 32
+  br i1 %exitcond2, label %bb4, label %bb24
+
+bb4:                                      ; preds = %bb3
+  br label %bb5
+
+bb5:                                      ; preds = %bb19, %bb4
+  %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ]
+  %exitcond1 = icmp ne i64 %j.0, 16
+  br i1 %exitcond1, label %bb6, label %bb21
+
+bb6:                                      ; preds = %bb5
+  br label %bb7
+
+bb7:                                      ; preds = %bb16, %bb6
+  %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ]
+  %exitcond = icmp ne i64 %k.0, 8
+  br i1 %exitcond, label %bb8, label %bb18
+
+bb8:                                      ; preds = %bb7
+  %tmp = mul nuw nsw i64 %j.0, %k.0
+  %tmp9 = sitofp i64 %tmp to float
+  %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0
+  %tmp11 = load float, float* %tmp10, align 4
+  %tmp12 = fmul float %tmp9, %tmp11
+  %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp14 = load float, float* %tmp13, align 4
+  %tmp15 = fadd float %tmp14, %tmp12
+  store float %tmp15, float* %tmp13, align 4
+  br label %bb16
+
+bb16:                                     ; preds = %bb8
+  %tmp17 = add nuw nsw i64 %k.0, 1
+  br label %bb7
+
+bb18:                                     ; preds = %bb7
+  br label %bb19
+
+bb19:                                     ; preds = %bb18
+  %tmp20 = add nuw nsw i64 %j.0, 1
+  br label %bb5
+
+bb21:                                     ; preds = %bb5
+  br label %bb22
+
+bb22:                                     ; preds = %bb21
+  %tmp23 = add nuw nsw i64 %i.0, 1
+  br label %bb3
+
+bb24:                                     ; preds = %bb3
+  ret void
+}