[polly] r245424 - Make prevectorization width configurable

Wed Aug 19 01:46:12 PDT 2015

Author: grosser
Date: Wed Aug 19 03:46:11 2015
New Revision: 245424

URL: http://llvm.org/viewvc/llvm-project?rev=245424&view=rev
Log:
Make prevectorization width configurable

Polly uses 'prevectorization' to enable outer loop vectorization. When
vectorizing an outer loop, we strip-mine <number-of-prevec-dims> loop
iterations which are then interchanged to the innermost level such that LLVM's
inner loop vectorizer (or Polly's simple vectorizer) can easily vectorize this
loop. The number of loop iterations to strip-mine is now configurable with the
option -polly-prevect-width=<number-of-prevec-dims>.

This is mostly a debugging option. We should probably add a heuristic that
derives the number of prevectorization dimensions from the target data and
the data types used.

Modified:
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp
    polly/trunk/test/ScheduleOptimizer/prevectorization.ll

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=245424&r1=245423&r2=245424&view=diff
==============================================================================

--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Wed Aug 19 03:46:11 2015
@@ -107,6 +107,12 @@ static cl::opt<std::string>
                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<int> PrevectorWidth(
+    "polly-prevect-width",
+    cl::desc(
+        "The number of loop iterations to strip-mine for pre-vectorization"),
+    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<int> DefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -176,7 +182,7 @@ private:
   /// reason about parallelism.
   static __isl_give isl_schedule_node *
   prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
-                   int VectorWidth = 4);
+                   int VectorWidth);
 
   /// @brief Apply additional optimizations on the bands in the schedule tree.
   ///
@@ -298,7 +304,7 @@ isl_schedule_node *IslScheduleOptimizer:
 
   for (int i = Dims - 1; i >= 0; i--)
     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
-      Node = IslScheduleOptimizer::prevectSchedBand(Node, i);
+      Node = IslScheduleOptimizer::prevectSchedBand(Node, i, PrevectorWidth);
       break;
     }
 

Modified: polly/trunk/test/ScheduleOptimizer/prevectorization.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/prevectorization.ll?rev=245424&r1=245423&r2=245424&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/prevectorization.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/prevectorization.ll Wed Aug 19 03:46:11 2015
@@ -1,5 +1,11 @@
 ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s 
 ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=stripmine -polly-ast -analyze < %s | FileCheck %s
+
+; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl \
+; RUN:                   -polly-vectorizer=polly -polly-ast -analyze \
+; RUN:                   -polly-prevect-width=16 < %s | \
+; RUN:                   FileCheck %s -check-prefix=VEC16
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
@@ -73,6 +79,28 @@ attributes #0 = { nounwind uwtable "less
 ; CHECK:             for (int c6 = 0; c6 <= 3; c6 += 1)
 ; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
 
+; VEC16: {
+; VEC16:   #pragma known-parallel
+; VEC16:   for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16:     for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; VEC16:         for (int c3 = 0; c3 <= 1; c3 += 1)
+; VEC16:           #pragma simd
+; VEC16:           for (int c4 = 0; c4 <= 15; c4 += 1)
+; VEC16:             Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
+; VEC16:   #pragma known-parallel
+; VEC16:   for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16:     for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16:       for (int c2 = 0; c2 <= 47; c2 += 1)
+; VEC16:         for (int c3 = 0; c3 <= 31; c3 += 1)
+; VEC16:           for (int c4 = 0; c4 <= 1; c4 += 1)
+; VEC16:             for (int c5 = 0; c5 <= 31; c5 += 1)
+; VEC16:               #pragma simd
+; VEC16:               for (int c6 = 0; c6 <= 15; c6 += 1)
+; VEC16:                 Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
+; VEC16: }
+
+
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.5.0 "}