[polly] r310374 - Do not use isl_set_project_out to get all loop prefixes

Roman Gareev via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 8 09:15:33 PDT 2017


Author: romangareev
Date: Tue Aug  8 09:15:33 2017
New Revision: 310374

URL: http://llvm.org/viewvc/llvm-project?rev=310374&view=rev
Log:
Do not use isl_set_project_out to get all loop prefixes

Currently, only convex isolation sets can be efficiently processed by isl.
Consequently, as a temporary solution, we use a different algorithm for partial
tile isolation that helps to build convex isolation sets in some cases.

Reviewed-by: Tobias Grosser <tobias at grosser.es>

Differential Revision: https://reviews.llvm.org/D36278

Added:
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
Modified:
    polly/trunk/include/polly/ScheduleOptimizer.h
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp

Modified: polly/trunk/include/polly/ScheduleOptimizer.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScheduleOptimizer.h?rev=310374&r1=310373&r2=310374&view=diff
==============================================================================
--- polly/trunk/include/polly/ScheduleOptimizer.h (original)
+++ polly/trunk/include/polly/ScheduleOptimizer.h Tue Aug  8 09:15:33 2017
@@ -338,13 +338,16 @@ private:
 /// We build a set of partial tile prefixes, which are prefixes of the vector
 /// loop that have exactly VectorWidth iterations.
 ///
-/// 1. Get all prefixes of the vector loop.
-/// 2. Extend it to a set, which has exactly VectorWidth iterations for
-///    any prefix from the set that was built on the previous step.
+/// 1. Drop all constraints involving the dimension that represents the
+///    vector loop.
+/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
+///    iterations.
 /// 3. Subtract loop domain from it, project out the vector loop dimension and
-///    get a set of prefixes, which don't have exactly VectorWidth iterations.
-/// 4. Subtract it from all prefixes of the vector loop and get the desired
-///    set.
+///    get a set that contains prefixes, which do not have exactly VectorWidth
+///    iterations.
+/// 4. Project out the vector loop dimension of the set that was build on the
+///    first step and subtract the set built on the previous step to get the
+///    desired set of prefixes.
 ///
 /// @param ScheduleRange A range of a map, which describes a prefix schedule
 ///                      relation.

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=310374&r1=310373&r2=310374&view=diff
==============================================================================
--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Tue Aug  8 09:15:33 2017
@@ -311,11 +311,12 @@ static isl::set addExtentConstraints(isl
 
 isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth) {
   unsigned Dims = ScheduleRange.dim(isl::dim::set);
-  isl::set LoopPrefixes = ScheduleRange.project_out(isl::dim::set, Dims - 1, 1);
-  isl::set ExtentPrefixes = LoopPrefixes.add_dims(isl::dim::set, 1);
-  ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
+  isl::set LoopPrefixes =
+      ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
+  auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
   isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
   BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
+  LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
   return LoopPrefixes.subtract(BadPrefixes);
 }
 

Added: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll?rev=310374&view=auto
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll (added)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll Tue Aug  8 09:15:33 2017
@@ -0,0 +1,353 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 < %s \
+; RUN: | FileCheck %s
+;
+; Test whether isolation works as expected.
+;
+; CHECK:    // Inter iteration alias-free
+; CHECK-NEXT:    // 1st level tiling - Tiles
+; CHECK-NEXT:    for (int c1 = 0; c1 <= 1; c1 += 1) {
+; CHECK-NEXT:      for (int c3 = 0; c3 <= 1019; c3 += 1)
+; CHECK-NEXT:        for (int c4 = 512 * c1; c4 <= min(1019, 512 * c1 + 511); c4 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c2 = 0; c2 <= 2; c2 += 1) {
+; CHECK-NEXT:        for (int c3 = 384 * c2; c3 <= min(1019, 384 * c2 + 383); c3 += 1)
+; CHECK-NEXT:          for (int c5 = 512 * c1; c5 <= min(1019, 512 * c1 + 511); c5 += 1)
+; CHECK-NEXT:            CopyStmt_1(c3, 0, c5);
+; CHECK-NEXT:        // 1st level tiling - Points
+; CHECK-NEXT:        // Register tiling - Tiles
+; CHECK-NEXT:        {
+; CHECK-NEXT:          for (int c3 = 0; c3 <= 30; c3 += 1) {
+; CHECK-NEXT:            for (int c4 = 0; c4 <= min(47, -48 * c2 + 126); c4 += 1)
+; CHECK-NEXT:              for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
+; CHECK-NEXT:                // Register tiling - Points
+; CHECK-NEXT:                {
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 1, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 2, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 3, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 4, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 5, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 6, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 1, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 2, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 3, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 4, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 5, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 6, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 7, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 8, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 9, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 10, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 11, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 12, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 13, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 14, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 15, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 16, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 17, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 18, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 19, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 20, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 21, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 22, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 23, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 24, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 25, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 26, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 27, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 28, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 29, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 30, 512 * c1 + c5);
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + 7, 32 * c3 + 31, 512 * c1 + c5);
+; CHECK-NEXT:                }
+; CHECK-NEXT:              }
+; CHECK-NEXT:            if (c2 == 2)
+; CHECK-NEXT:              for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
+; CHECK-NEXT:                // Register tiling - Points
+; CHECK-NEXT:                for (int c6 = 0; c6 <= 3; c6 += 1)
+; CHECK-NEXT:                  for (int c7 = 0; c7 <= 31; c7 += 1)
+; CHECK-NEXT:                    Stmt_for_body6(c6 + 1016, 32 * c3 + c7, 512 * c1 + c5);
+; CHECK-NEXT:              }
+; CHECK-NEXT:          }
+; CHECK-NEXT:          for (int c4 = 0; c4 <= min(47, -48 * c2 + 127); c4 += 1)
+; CHECK-NEXT:            for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
+; CHECK-NEXT:              // Register tiling - Points
+; CHECK-NEXT:              for (int c6 = 0; c6 <= min(7, -384 * c2 - 8 * c4 + 1019); c6 += 1)
+; CHECK-NEXT:                for (int c7 = 0; c7 <= 27; c7 += 1)
+; CHECK-NEXT:                  Stmt_for_body6(384 * c2 + 8 * c4 + c6, c7 + 992, 512 * c1 + c5);
+; CHECK-NEXT:            }
+; CHECK-NEXT:        }
+; CHECK-NEXT:      }
+; CHECK-NEXT:    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, i8 signext %alpha, i8 signext %beta, [1020 x i8]* %C, [1020 x i8]* %A, [1020 x i8]* %B) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc23, %entry.split
+  %indvars.iv45 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next46, %for.inc23 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.inc20, %for.body
+  %indvars.iv42 = phi i64 [ 0, %for.body ], [ %indvars.iv.next43, %for.inc20 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [1020 x i8], [1020 x i8]* %A, i64 %indvars.iv45, i64 %indvars.iv
+  %tmp = load i8, i8* %arrayidx8, align 1
+  %arrayidx12 = getelementptr inbounds [1020 x i8], [1020 x i8]* %B, i64 %indvars.iv, i64 %indvars.iv42
+  %tmp1 = load i8, i8* %arrayidx12, align 1
+  %mul = mul i8 %tmp1, %tmp
+  %arrayidx17 = getelementptr inbounds [1020 x i8], [1020 x i8]* %C, i64 %indvars.iv45, i64 %indvars.iv42
+  %tmp2 = load i8, i8* %arrayidx17, align 1
+  %add = add i8 %mul, %tmp2
+  store i8 %add, i8* %arrayidx17, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1020
+  br i1 %exitcond, label %for.body6, label %for.inc20
+
+for.inc20:                                        ; preds = %for.body6
+  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+  %exitcond44 = icmp ne i64 %indvars.iv.next43, 1020
+  br i1 %exitcond44, label %for.body3, label %for.inc23
+
+for.inc23:                                        ; preds = %for.inc20
+  %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+  %exitcond47 = icmp ne i64 %indvars.iv.next46, 1020
+  br i1 %exitcond47, label %for.body, label %for.end25
+
+for.end25:                                        ; preds = %for.inc23
+  ret void
+}

Added: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll?rev=310374&view=auto
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll (added)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll Tue Aug  8 09:15:33 2017
@@ -0,0 +1,122 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=2 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=128 \
+; RUN: -polly-target-2nd-cache-level-size=262144 < %s \
+; RUN: | FileCheck %s
+;
+; Test whether isolation works as expected.
+;
+; CHECK:   // Inter iteration alias-free
+; CHECK-NEXT:    // 1st level tiling - Tiles
+; CHECK-NEXT:    for (int c0 = 0; c0 <= 1; c0 += 1)
+; CHECK-NEXT:      for (int c1 = 0; c1 <= 6; c1 += 1) {
+; CHECK-NEXT:        for (int c3 = 1536 * c0; c3 <= min(1999, 1536 * c0 + 1535); c3 += 1)
+; CHECK-NEXT:          for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
+; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c2 = 0; c2 <= 24; c2 += 1) {
+; CHECK-NEXT:          if (c0 == 0)
+; CHECK-NEXT:            for (int c3 = 80 * c2; c3 <= 80 * c2 + 79; c3 += 1)
+; CHECK-NEXT:              for (int c5 = 307 * c1; c5 <= min(1999, 307 * c1 + 306); c5 += 1)
+; CHECK-NEXT:                CopyStmt_1(c3, 0, c5);
+; CHECK-NEXT:          // 1st level tiling - Points
+; CHECK-NEXT:          // Register tiling - Tiles
+; CHECK-NEXT:          {
+; CHECK-NEXT:            for (int c3 = 0; c3 <= min(255, -256 * c0 + 332); c3 += 1)
+; CHECK-NEXT:              for (int c4 = 0; c4 <= 15; c4 += 1)
+; CHECK-NEXT:                for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
+; CHECK-NEXT:                  // Register tiling - Points
+; CHECK-NEXT:                  {
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
+; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
+; CHECK-NEXT:                  }
+; CHECK-NEXT:                }
+; CHECK-NEXT:            if (c0 == 1)
+; CHECK-NEXT:              for (int c4 = 0; c4 <= 15; c4 += 1)
+; CHECK-NEXT:                for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
+; CHECK-NEXT:                  // Register tiling - Points
+; CHECK-NEXT:                  for (int c6 = 0; c6 <= 4; c6 += 1)
+; CHECK-NEXT:                    for (int c7 = 0; c7 <= 1; c7 += 1)
+; CHECK-NEXT:                      Stmt_for_body6(80 * c2 + 5 * c4 + c6, c7 + 1998, 307 * c1 + c5);
+; CHECK-NEXT:                }
+; CHECK-NEXT:          }
+; CHECK-NEXT:        }
+; CHECK-NEXT:      }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [2000 x double]* %C, [2000 x double]* %A, [2000 x double]* %B) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc20, %entry.split
+  %indvars.iv41 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next42, %for.inc20 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.inc17, %for.body
+  %indvars.iv38 = phi i64 [ 0, %for.body ], [ %indvars.iv.next39, %for.inc17 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [2000 x double], [2000 x double]* %A, i64 %indvars.iv41, i64 %indvars.iv
+  %tmp = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [2000 x double], [2000 x double]* %B, i64 %indvars.iv, i64 %indvars.iv38
+  %tmp1 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %tmp, %tmp1
+  %arrayidx16 = getelementptr inbounds [2000 x double], [2000 x double]* %C, i64 %indvars.iv41, i64 %indvars.iv38
+  %tmp2 = load double, double* %arrayidx16, align 8
+  %add = fadd double %tmp2, %mul
+  store double %add, double* %arrayidx16, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2000
+  br i1 %exitcond, label %for.body6, label %for.inc17
+
+for.inc17:                                        ; preds = %for.body6
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp ne i64 %indvars.iv.next39, 2000
+  br i1 %exitcond40, label %for.body3, label %for.inc20
+
+for.inc20:                                        ; preds = %for.inc17
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond43 = icmp ne i64 %indvars.iv.next42, 2000
+  br i1 %exitcond43, label %for.body, label %for.end22
+
+for.end22:                                        ; preds = %for.inc20
+  ret void
+}




More information about the llvm-commits mailing list