[polly] r243458 - Rewrite getPrevectorMap using schedule trees operations

Tue Jul 28 11:03:37 PDT 2015

Author: grosser
Date: Tue Jul 28 13:03:36 2015
New Revision: 243458

URL: http://llvm.org/viewvc/llvm-project?rev=243458&view=rev
Log:
Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman at gmail.com>

Modified:
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp
    polly/trunk/test/ScheduleOptimizer/prevectorization-without-tiling.ll
    polly/trunk/test/ScheduleOptimizer/prevectorization.ll

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=243458&r1=243457&r2=243458&view=diff
==============================================================================

--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Tue Jul 28 13:03:36 2015
@@ -118,16 +118,14 @@ private:
   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
   bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule);
 
-  /// @brief Create a map that pre-vectorizes one scheduling dimension.
+  /// @brief Pre-vectorizes one scheduling dimension of a schedule band.
   ///
-  /// getPrevectorMap creates a map that maps each input dimension to the same
-  /// output dimension, except for the dimension DimToVectorize.
-  /// DimToVectorize is strip mined by 'VectorWidth' and the newly created
-  /// point loop of DimToVectorize is moved to the innermost level.
+  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
+  /// sinks the resulting point loop.
   ///
-  /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4):
+  /// Example (DimToVectorize=0, VectorWidth=4):
   ///
-  /// | Before transformation
+  /// | Before transformation:
   /// |
   /// | A[i,j] -> [i,j]
   /// |
@@ -135,17 +133,12 @@ private:
   /// |    for (j = 0; j < 128; j++)
   /// |      A(i,j);
   ///
-  ///   Prevector map:
-  ///   [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  ///
   /// | After transformation:
   /// |
-  /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  /// |
-  /// | for (it = 0; it < 128; it+=4)
+  /// | for (it = 0; it < 32; it+=1)
   /// |    for (j = 0; j < 128; j++)
-  /// |      for (ip = max(0,it); ip < min(128, it + 3); ip++)
-  /// |        A(ip,j);
+  /// |      for (ip = 0; ip <= 3; ip++)
+  /// |        A(4 * it + ip,j);
   ///
   /// The goal of this transformation is to create a trivially vectorizable
   /// loop.  This means a parallel loop at the innermost level that has a
@@ -156,9 +149,9 @@ private:
   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
   /// currently constant and not yet target specific. This function does not
   /// reason about parallelism.
-  static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                             int ScheduleDimensions,
-                                             int VectorWidth = 4);
+  static __isl_give isl_schedule_node *
+  prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
+                   int VectorWidth = 4);
 
   /// @brief Apply additional optimizations on the bands in the schedule tree.
   ///
@@ -170,7 +163,7 @@ private:
   ///      - if the band has more than one loop dimension
   ///
   ///  - Prevectorize the schedule of the band (or the point loop in case of
-  ///    tiling)
+  ///    tiling).
   ///      - if vectorization is enabled
   ///
   /// @param Node The schedule node to (possibly) optimize.
@@ -204,58 +197,33 @@ private:
 
 char IslScheduleOptimizer::ID = 0;
 
-__isl_give isl_map *
-IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                      int ScheduleDimensions, int VectorWidth) {
-  isl_space *Space;
-  isl_local_space *LocalSpace, *LocalSpaceRange;
-  isl_set *Modulo;
-  isl_map *TilingMap;
-  isl_constraint *c;
-  isl_aff *Aff;
-  int PointDimension; /* ip */
-  int TileDimension;  /* it */
-  isl_val *VectorWidthMP;
-
-  assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions);
-
-  Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1);
-  TilingMap = isl_map_universe(isl_space_copy(Space));
-  LocalSpace = isl_local_space_from_space(Space);
-  PointDimension = ScheduleDimensions;
-  TileDimension = DimToVectorize;
-
-  // Create an identity map for everything except DimToVectorize and map
-  // DimToVectorize to the point loop at the innermost dimension.
-  for (int i = 0; i < ScheduleDimensions; i++)
-    if (i == DimToVectorize)
-      TilingMap =
-          isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension);
-    else
-      TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i);
-
-  // it % 'VectorWidth' = 0
-  LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace));
-  Aff = isl_aff_zero_on_domain(LocalSpaceRange);
-  Aff = isl_aff_set_constant_si(Aff, VectorWidth);
-  Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1);
-  VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth);
-  Aff = isl_aff_mod_val(Aff, VectorWidthMP);
-  Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff));
-  TilingMap = isl_map_intersect_range(TilingMap, Modulo);
-
-  // it <= ip
-  TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension,
-                               isl_dim_out, PointDimension);
-
-  // ip <= it + ('VectorWidth' - 1)
-  c = isl_inequality_alloc(LocalSpace);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1);
-  isl_constraint_set_constant_si(c, VectorWidth - 1);
-  TilingMap = isl_map_add_constraint(TilingMap, c);
+__isl_give isl_schedule_node *
+IslScheduleOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
+                                       unsigned DimToVectorize,
+                                       int VectorWidth) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
 
-  return TilingMap;
+  auto Space = isl_schedule_node_band_get_space(Node);
+  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
+  isl_space_free(Space);
+  assert(DimToVectorize < ScheduleDimensions);
+
+  if (DimToVectorize > 0) {
+    Node = isl_schedule_node_band_split(Node, DimToVectorize);
+    Node = isl_schedule_node_child(Node, 0);
+  }
+  if (DimToVectorize < ScheduleDimensions - 1)
+    Node = isl_schedule_node_band_split(Node, 1);
+  Space = isl_schedule_node_band_get_space(Node);
+  auto Sizes = isl_multi_val_zero(Space);
+  auto Ctx = isl_schedule_node_get_ctx(Node);
+  Sizes =
+      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
+  Node = isl_schedule_node_band_tile(Node, Sizes);
+  Node = isl_schedule_node_child(Node, 0);
+  Node = isl_schedule_node_band_sink(Node);
+  Node = isl_schedule_node_child(Node, 0);
+  return Node;
 }
 
 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
@@ -307,21 +275,12 @@ isl_schedule_node *IslScheduleOptimizer:
   if (PollyVectorizerChoice == VECTORIZER_NONE)
     return Res;
 
-  auto Schedule = isl_schedule_node_band_get_partial_schedule(Res);
-
-  for (int i = Dims - 1; i >= 0; i--) {
+  for (int i = Dims - 1; i >= 0; i--)
     if (isl_schedule_node_band_member_get_coincident(Res, i)) {
-      auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims);
-      auto TileUMap = isl_union_map_from_map(TileMap);
-      auto Schedule2 = isl_union_map_apply_range(
-          isl_union_map_from_multi_union_pw_aff(Schedule), TileUMap);
-      Schedule = isl_multi_union_pw_aff_from_union_map(Schedule2);
+      Res = IslScheduleOptimizer::prevectSchedBand(Res, i);
       break;
     }
-  }
 
-  Res = isl_schedule_node_delete(Res);
-  Res = isl_schedule_node_insert_partial_schedule(Res, Schedule);
   return Res;
 }
 

Modified: polly/trunk/test/ScheduleOptimizer/prevectorization-without-tiling.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/prevectorization-without-tiling.ll?rev=243458&r1=243457&r2=243458&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/prevectorization-without-tiling.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/prevectorization-without-tiling.ll Tue Jul 28 13:03:36 2015
@@ -55,17 +55,17 @@ attributes #0 = { nounwind uwtable "less
 
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
-; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:   for (int c1 = 0; c1 <= 383; c1 += 1)
 ; CHECK:     #pragma simd
-; CHECK:     for (int c2 = c1; c2 <= c1 + 3; c2 += 1)
-; CHECK:       Stmt_for_body3(c0, c2);
+; CHECK:     for (int c2 = 0; c2 <= 3; c2 += 1)
+; CHECK:       Stmt_for_body3(c0, 4 * c1 + c2);
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
-; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:   for (int c1 = 0; c1 <= 383; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 1535; c2 += 1)
 ; CHECK:       #pragma simd
-; CHECK:       for (int c3 = c1; c3 <= c1 + 3; c3 += 1)
-; CHECK:         Stmt_for_body8(c0, c3, c2);
+; CHECK:       for (int c3 = 0; c3 <= 3; c3 += 1)
+; CHECK:         Stmt_for_body8(c0, 4 * c1 + c3, c2);
 
 !llvm.ident = !{!0}
 

Modified: polly/trunk/test/ScheduleOptimizer/prevectorization.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/prevectorization.ll?rev=243458&r1=243457&r2=243458&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/prevectorization.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/prevectorization.ll Tue Jul 28 13:03:36 2015
@@ -58,20 +58,20 @@ attributes #0 = { nounwind uwtable "less
 ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 47; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 31; c2 += 1)
-; CHECK:       for (int c3 = 0; c3 <= 31; c3 += 4)
+; CHECK:       for (int c3 = 0; c3 <= 7; c3 += 1)
 ; CHECK:         #pragma simd
-; CHECK:         for (int c4 = c3; c4 <= c3 + 3; c4 += 1)
-; CHECK:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + c4);
+; CHECK:         for (int c4 = 0; c4 <= 3; c4 += 1)
+; CHECK:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4);
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 47; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 47; c2 += 1)
 ; CHECK:       for (int c3 = 0; c3 <= 31; c3 += 1)
-; CHECK:         for (int c4 = 0; c4 <= 31; c4 += 4)
+; CHECK:         for (int c4 = 0; c4 <= 7; c4 += 1)
 ; CHECK:           for (int c5 = 0; c5 <= 31; c5 += 1)
 ; CHECK:             #pragma simd
-; CHECK:             for (int c6 = c4; c6 <= c4 + 3; c6 += 1)
-; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + c6, 32 * c2 + c5);
+; CHECK:             for (int c6 = 0; c6 <= 3; c6 += 1)
+; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
 
 !llvm.ident = !{!0}