[polly] r299662 - Restore the initial ordering of dimensions before applying the pattern matching

Roman Gareev via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 6 10:09:55 PDT 2017


Author: romangareev
Date: Thu Apr  6 12:09:54 2017
New Revision: 299662

URL: http://llvm.org/viewvc/llvm-project?rev=299662&view=rev
Log:
Restore the initial ordering of dimensions before applying the pattern matching

Dimensions of band nodes can be implicitly permuted by the algorithm applied
during the schedule generation.

For example, in case of the following matrix-matrix multiplication,

for (i = 0; i < 1024; i++)
  for (k = 0; k < 1024; k++)
    for (j = 0; j < 1024; j++)
      C[i][j] += A[i][k] * B[k][j];

it can produce the following schedule tree

domain: "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and
                                        0 <= i2 <= 1023 }"
child:
  schedule: "[{ Stmt_for_body6[i0, i1, i2] -> [(i0)] },
              { Stmt_for_body6[i0, i1, i2] -> [(i1)] },
              { Stmt_for_body6[i0, i1, i2] -> [(i2)] }]"
  permutable: 1
  coincident: [ 1, 1, 0 ]

The current implementation of the pattern matching optimizations relies on the
initial ordering of dimensions. Otherwise, it can produce the miscompilation
(e.g., [1]).

This patch helps to restore the initial ordering of dimensions by recreating
the band node when the corresponding conditions are satisfied.

Refs.:

[1] - https://bugs.llvm.org/show_bug.cgi?id=32500

Reviewed-by: Michael Kruse <llvm at meinersbur.de>

Differential Revision: https://reviews.llvm.org/D31741

Added:
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
Modified:
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=299662&r1=299661&r2=299662&view=diff
==============================================================================
--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Thu Apr  6 12:09:54 2017
@@ -1268,6 +1268,38 @@ static isl_schedule_node *markInterItera
   return isl_schedule_node_child(isl_schedule_node_insert_mark(Node, Id), 0);
 }
 
+/// Restore the initial ordering of dimensions of the band node
+///
+/// In case the band node represents all the dimensions of the iteration
+/// domain, recreate the band node to restore the initial ordering of the
+/// dimensions.
+///
+/// @param Node The band node to be modified.
+/// @return The modified schedule node.
+namespace {
+isl::schedule_node getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
+  assert(isl_schedule_node_get_type(Node.keep()) == isl_schedule_node_band);
+  if (isl_schedule_node_get_type(Node.child(0).keep()) !=
+      isl_schedule_node_leaf)
+    return Node;
+  auto Domain = isl::manage(isl_schedule_node_get_universe_domain(Node.keep()));
+  assert(isl_union_set_n_set(Domain.keep()) == 1);
+  if (isl_schedule_node_get_schedule_depth(Node.keep()) != 0 ||
+      (isl::set(isl::manage(Domain.copy())).dim(isl::dim::set) !=
+       isl_schedule_node_band_n_member(Node.keep())))
+    return Node;
+  Node = isl::manage(isl_schedule_node_delete(Node.take()));
+  auto PartialSchedulePwAff =
+      isl::manage(isl_union_set_identity_union_pw_multi_aff(Domain.take()));
+  auto PartialScheduleMultiPwAff =
+      isl::multi_union_pw_aff(PartialSchedulePwAff);
+  PartialScheduleMultiPwAff = isl::manage(isl_multi_union_pw_aff_reset_tuple_id(
+      PartialScheduleMultiPwAff.take(), isl_dim_set));
+  return isl::manage(isl_schedule_node_insert_partial_schedule(
+      Node.take(), PartialScheduleMultiPwAff.take()));
+}
+} // namespace
+
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI,
     MatMulInfoTy &MMI) {
@@ -1277,6 +1309,7 @@ __isl_give isl_schedule_node *ScheduleTr
   assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
                           "and, consequently, the corresponding scheduling "
                           "functions have at least three dimensions.");
+  Node = getBandNodeWithOriginDimOrder(isl::manage(Node)).take();
   Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
   int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
   int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
@@ -1304,7 +1337,12 @@ bool ScheduleTreeOptimizer::isMatrMultPa
     MatMulInfoTy &MMI) {
   auto *PartialSchedule =
       isl_schedule_node_band_get_partial_schedule_union_map(Node);
-  if (isl_schedule_node_band_n_member(Node) < 3 ||
+  Node = isl_schedule_node_child(Node, 0);
+  auto LeafType = isl_schedule_node_get_type(Node);
+  Node = isl_schedule_node_parent(Node);
+  if (LeafType != isl_schedule_node_leaf ||
+      isl_schedule_node_band_n_member(Node) < 3 ||
+      isl_schedule_node_get_schedule_depth(Node) != 0 ||
       isl_union_map_n_map(PartialSchedule) != 1) {
     isl_union_map_free(PartialSchedule);
     return false;

Added: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll?rev=299662&view=auto
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll (added)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll Thu Apr  6 12:09:54 2017
@@ -0,0 +1,123 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -debug < %s 2>&1| FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \
+; RUN: -analyze < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; REQUIRES: asserts
+;
+;    C := A * B + C
+;    Check that the pattern matching optimizations can detect different
+;    permutations of GEMM loop and produce the correct ISL AST. In this case,
+;    dimensions of band nodes can be implicitly permuted by the algorithm
+;    applied during the schedule generation. It should be taken into the
+;    account during the pattern matching optimizations.
+;    for (i = 0; i < _PB_NI; i++)
+;      for (k = 0; k < _PB_NK; ++k)
+;        for (j = 0; j < _PB_NJ; j++)
+;	   C[i][j] += A[i][k] * B[k][j];
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+; PATTERN-MATCHING-OPTS:    // 1st level tiling - Tiles
+; PATTERN-MATCHING-OPTS-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
+; PATTERN-MATCHING-OPTS-NEXT:      for (int c3 = 256 * c1; c3 <= 256 * c1 + 255; c3 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:        for (int c4 = 0; c4 <= 1023; c4 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:          CopyStmt_0(0, c3, c4);
+; PATTERN-MATCHING-OPTS-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
+; PATTERN-MATCHING-OPTS-NEXT:        for (int c3 = 96 * c2; c3 <= min(1023, 96 * c2 + 95); c3 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:          for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:            CopyStmt_1(c3, c4, 0);
+; PATTERN-MATCHING-OPTS-NEXT:        // 1st level tiling - Points
+; PATTERN-MATCHING-OPTS-NEXT:        // Register tiling - Tiles
+; PATTERN-MATCHING-OPTS-NEXT:        for (int c3 = 0; c3 <= 127; c3 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:          for (int c4 = 0; c4 <= min(23, -24 * c2 + 255); c4 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:            for (int c5 = 0; c5 <= 255; c5 += 1) {
+; PATTERN-MATCHING-OPTS-NEXT:              // Register tiling - Points
+; PATTERN-MATCHING-OPTS-NEXT:              {
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 1);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 2);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 4);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 5);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 6);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 7);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 1);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 2);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 4);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 5);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 6);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 7);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 1);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 2);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 4);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 5);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 6);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 7);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 1);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 2);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 3);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 4);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 5);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 6);
+; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 7);
+; PATTERN-MATCHING-OPTS-NEXT:              }
+; PATTERN-MATCHING-OPTS-NEXT:            }
+; PATTERN-MATCHING-OPTS-NEXT:      }
+; PATTERN-MATCHING-OPTS-NEXT:    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc20, %entry.split
+  %indvars.iv41 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next42, %for.inc20 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc17, %for.cond1.preheader
+  %indvars.iv38 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next39, %for.inc17 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv41, i64 %indvars.iv38
+  %tmp = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv38, i64 %indvars.iv
+  %tmp1 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %tmp, %tmp1
+  %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv41, i64 %indvars.iv
+  %tmp2 = load double, double* %arrayidx16, align 8
+  %add = fadd double %tmp2, %mul
+  store double %add, double* %arrayidx16, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body6, label %for.inc17
+
+for.inc17:                                        ; preds = %for.body6
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp ne i64 %indvars.iv.next39, 1024
+  br i1 %exitcond40, label %for.cond4.preheader, label %for.inc20
+
+for.inc20:                                        ; preds = %for.inc17
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond43 = icmp ne i64 %indvars.iv.next42, 1024
+  br i1 %exitcond43, label %for.cond1.preheader, label %for.end22
+
+for.end22:                                        ; preds = %for.inc20
+  ret void
+}




More information about the llvm-commits mailing list