[polly] r203544 - ScheduleOptimizer: Fix prevectorization.

Mon Mar 10 23:27:36 PDT 2014

Author: grosser
Date: Tue Mar 11 01:27:36 2014
New Revision: 203544

URL: http://llvm.org/viewvc/llvm-project?rev=203544&view=rev
Log:
ScheduleOptimizer: Fix prevectorization.

When we are at the innermost band, we try to prepare for vectorization. This
means we look for the innermost parallel loop and strip-mine it to the
innermost level, using a strip-mine factor corresponding to the number of
vector iterations.

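For whatever reason, the code that implemented this feature was broken. The
loop now walks the band members from the innermost to the outermost, and the
dimension passed to getPrevectorMap is ScheduleDimensions - NumDims + j instead
of ScheduleDimensions - j - 1. We have also added a comment and a test case.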

Added:
    polly/trunk/test/ScheduleOptimizer/prevectorization.ll
Modified:
    polly/trunk/lib/ScheduleOptimizer.cpp

Modified: polly/trunk/lib/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/ScheduleOptimizer.cpp?rev=203544&r1=203543&r2=203544&view=diff
==============================================================================

--- polly/trunk/lib/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/ScheduleOptimizer.cpp Tue Mar 11 01:27:36 2014
@@ -395,12 +395,17 @@ IslScheduleOptimizer::getScheduleForBand
           isl_union_map_flat_range_product(PartialSchedule, SuffixSchedule);
       isl_band_list_free(Children);
     } else if (PollyVectorizerChoice != VECTORIZER_NONE) {
-      for (int j = 0; j < isl_band_n_member(Band); j++) {
+      // In case we are at the innermost band, we try to prepare for
+      // vectorization. This means, we look for the innermost parallel loop
+      // and strip mine this loop to the innermost level using a strip-mine
+      // factor corresponding to the number of vector iterations.
+      int NumDims = isl_band_n_member(Band);
+      for (int j = NumDims - 1; j >= 0; j--) {
         if (isl_band_member_is_coincident(Band, j)) {
           isl_map *TileMap;
           isl_union_map *TileUMap;
 
-          TileMap = getPrevectorMap(ctx, ScheduleDimensions - j - 1,
+          TileMap = getPrevectorMap(ctx, ScheduleDimensions - NumDims + j,
                                     ScheduleDimensions);
           TileUMap = isl_union_map_from_map(TileMap);
           TileUMap =

Added: polly/trunk/test/ScheduleOptimizer/prevectorization.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/prevectorization.ll?rev=203544&view=auto
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/prevectorization.ll (added)
+++ polly/trunk/test/ScheduleOptimizer/prevectorization.ll Tue Mar 11 01:27:36 2014
@@ -0,0 +1,78 @@
+; RUN: opt -S %loadPolly -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @foo() #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry.split, %for.inc28
+  %indvar4 = phi i64 [ 0, %entry.split ], [ %indvar.next5, %for.inc28 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.inc25
+  %indvar6 = phi i64 [ 0, %for.cond1.preheader ], [ %indvar.next7, %for.inc25 ]
+  %arrayidx24 = getelementptr [1536 x [1536 x float]]* @C, i64 0, i64 %indvar4, i64 %indvar6
+  store float 0.000000e+00, float* %arrayidx24, align 4
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body3, %for.body8
+  %indvar = phi i64 [ 0, %for.body3 ], [ %indvar.next, %for.body8 ]
+  %arrayidx16 = getelementptr [1536 x [1536 x float]]* @A, i64 0, i64 %indvar4, i64 %indvar
+  %arrayidx20 = getelementptr [1536 x [1536 x float]]* @B, i64 0, i64 %indvar, i64 %indvar6
+  %0 = load float* %arrayidx24, align 4
+  %1 = load float* %arrayidx16, align 4
+  %2 = load float* %arrayidx20, align 4
+  %mul = fmul float %1, %2
+  %add = fadd float %0, %mul
+  store float %add, float* %arrayidx24, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, 1536
+  br i1 %exitcond, label %for.body8, label %for.inc25
+
+for.inc25:                                        ; preds = %for.body8
+  %indvar.next7 = add i64 %indvar6, 1
+  %exitcond8 = icmp ne i64 %indvar.next7, 1536
+  br i1 %exitcond8, label %for.body3, label %for.inc28
+
+for.inc28:                                        ; preds = %for.inc25
+  %indvar.next5 = add i64 %indvar4, 1
+  %exitcond9 = icmp ne i64 %indvar.next5, 1536
+  br i1 %exitcond9, label %for.cond1.preheader, label %for.end30
+
+for.end30:                                        ; preds = %for.inc28
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; CHECK: #pragma omp parallel for
+; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32)
+; CHECK:   for (int c2 = 0; c2 <= 1535; c2 += 32)
+; CHECK:     for (int c3 = c1; c3 <= c1 + 31; c3 += 1)
+; CHECK:       for (int c4 = c2; c4 <= c2 + 31; c4 += 4)
+; CHECK:         #pragma simd
+; CHECK:         for (int c5 = c4; c5 <= c4 + 3; c5 += 1)
+; CHECK:           Stmt_for_body3(c3, c5);
+; CHECK: #pragma omp parallel for
+; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32)
+; CHECK:   for (int c2 = 0; c2 <= 1535; c2 += 32)
+; CHECK:     for (int c3 = 0; c3 <= 1535; c3 += 32)
+; CHECK:       for (int c4 = c1; c4 <= c1 + 31; c4 += 1)
+; CHECK:         for (int c5 = c2; c5 <= c2 + 31; c5 += 4)
+; CHECK:           for (int c6 = c3; c6 <= c3 + 31; c6 += 1)
+; CHECK:             #pragma simd
+; CHECK:             for (int c7 = c5; c7 <= c5 + 3; c7 += 1)
+; CHECK:               Stmt_for_body8(c4, c7, c6);
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 "}