[polly] r293890 - A new algorithm for identification of a SCoP statement that implement a matrix

Thu Feb 2 06:23:15 PST 2017

Author: romangareev
Date: Thu Feb  2 08:23:14 2017
New Revision: 293890

URL: http://llvm.org/viewvc/llvm-project?rev=293890&view=rev
Log:
A new algorithm for identification of a SCoP statement that implements a matrix
multiplication

The current identification of a SCoP statement that implements a matrix
multiplication does not help to identify different permutations of the loops
that contain it, nor to check for dependencies that can prevent it from being
optimized. It also requires external determination of the operands of
the matrix multiplication. This patch contains the implementation of a new
algorithm that helps to avoid these issues. It also modifies the test cases
that generate matrix multiplications with linearized accesses, because
the new algorithm does not support them.

Reviewed-by: Michael Kruse <llvm at meinersbur.de>,
             Tobias Grosser <tobias at grosser.es>

Differential Revision: https://reviews.llvm.org/D28357

Modified:
    polly/trunk/include/polly/ScheduleOptimizer.h
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts.ll
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll

Modified: polly/trunk/include/polly/ScheduleOptimizer.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScheduleOptimizer.h?rev=293890&r1=293889&r2=293890&view=diff
==============================================================================

--- polly/trunk/include/polly/ScheduleOptimizer.h (original)
+++ polly/trunk/include/polly/ScheduleOptimizer.h Thu Feb  2 08:23:14 2017
@@ -12,6 +12,7 @@
 #ifndef POLLY_SCHEDULE_OPTIMIZER_H
 #define POLLY_SCHEDULE_OPTIMIZER_H
 
+#include "polly/DependenceInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "isl/ctx.h"
@@ -42,6 +43,31 @@ struct MacroKernelParamsTy {
 };
 
 namespace polly {
+/// Additional parameters of the schedule optimizer.
+///
+/// Target Transform Info and the SCoP dependencies used by the schedule
+/// optimizer.
+///
+struct OptimizerAdditionalInfoTy {
+  const llvm::TargetTransformInfo *TTI;
+  const Dependences *D;
+};
+
+/// Parameters of the matrix multiplication operands.
+///
+/// Parameters, which describe access relations that represent operands of the
+/// matrix multiplication.
+///
+struct MatMulInfoTy {
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  int i = -1;
+  int j = -1;
+  int k = -1;
+};
+
 extern bool DisablePollyTiling;
 class Scop;
 } // namespace polly
@@ -59,11 +85,11 @@ public:
   ///
   /// @param Schedule The schedule object the transformations will be applied
   ///                 to.
-  /// @param TTI      Target Transform Info.
+  /// @param OAI      Target Transform Info and the SCoP dependencies.
   /// @returns        The transformed schedule.
   static __isl_give isl_schedule *
   optimizeSchedule(__isl_take isl_schedule *Schedule,
-                   const llvm::TargetTransformInfo *TTI = nullptr);
+                   const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Apply schedule tree transformations.
   ///
@@ -75,11 +101,11 @@ public:
   ///   - Prevectorization
   ///
   /// @param Node The schedule object post-transformations will be applied to.
-  /// @param TTI  Target Transform Info.
+  /// @param OAI  Target Transform Info and the SCoP dependencies.
   /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeScheduleNode(__isl_take isl_schedule_node *Node,
-                       const llvm::TargetTransformInfo *TTI = nullptr);
+                       const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Decide if the @p NewSchedule is profitable for @p S.
   ///
@@ -128,10 +154,11 @@ private:
 
   /// Apply the BLIS matmul optimization pattern.
   ///
-  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
-  /// nested loops around a macro-kernel, plus two packing routines.
-  /// The macro-kernel is implemented in terms of two additional loops around
-  /// a micro-kernel. The micro-kernel is a loop around a rank-1
+  /// Make the loops containing the matrix multiplication be the innermost
+  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
+  /// gemm as three nested loops around a macro-kernel, plus two packing
+  /// routines. The macro-kernel is implemented in terms of two additional
+  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
   /// (i.e., outer product) update.
   ///
   /// For a detailed description please see [1].
@@ -167,9 +194,13 @@ private:
   /// @param Node The node that contains a band to be optimized. The node
   ///             is required to successfully pass
   ///             ScheduleTreeOptimizer::isMatrMultPattern.
+  /// @param TTI  Target Transform Info.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
-                        const llvm::TargetTransformInfo *TTI);
+                        const llvm::TargetTransformInfo *TTI,
+                        polly::MatMulInfoTy &MMI);
 
   /// Check if this node is a band node we want to tile.
   ///
@@ -266,7 +297,11 @@ private:
   /// the one used to get close-to-peak performance of matrix multiplications.
   ///
   /// @param Node The node to check.
-  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+  /// @param D    The SCoP dependencies.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node,
+                                const polly::Dependences *D,
+                                polly::MatMulInfoTy &MMI);
 
   /// Create the BLIS macro-kernel.
   ///

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=293890&r1=293889&r2=293890&view=diff
==============================================================================
--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Thu Feb  2 08:23:14 2017
@@ -468,26 +468,302 @@ ScheduleTreeOptimizer::standardBandOpts(
   return Node;
 }
 
-/// Check whether output dimensions of the map rely on the specified input
-/// dimension.
+/// Get the position of a dimension with a non-zero coefficient.
 ///
-/// @param IslMap The isl map to be considered.
-/// @param DimNum The number of an input dimension to be checked.
-static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
-  auto *CheckedAccessRelation =
-      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
-  CheckedAccessRelation =
-      isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
-  auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
-  CheckedAccessRelation =
-      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
-  InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
-  CheckedAccessRelation =
-      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
-  auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
-  isl_map_free(CheckedAccessRelation);
-  isl_map_free(IslMap);
-  return res;
+/// Check that isl constraint @p Constraint has only one non-zero
+/// coefficient for dimensions that have type @p DimType. If this is true,
+/// return the position of the dimension corresponding to the non-zero
+/// coefficient, and a negative value otherwise.
+///
+/// @param Constraint The isl constraint to be checked.
+/// @param DimType    The type of the dimensions.
+/// @return           The position of the dimension in case the isl
+///                   constraint satisfies the requirements, a negative
+///                   value, otherwise.
+static int getMatMulConstraintDim(__isl_keep isl_constraint *Constraint,
+                                  enum isl_dim_type DimType) {
+  int DimPos = -1;
+  auto *LocalSpace = isl_constraint_get_local_space(Constraint);
+  int LocalSpaceDimNum = isl_local_space_dim(LocalSpace, DimType);
+  for (int i = 0; i < LocalSpaceDimNum; i++) {
+    auto *Val = isl_constraint_get_coefficient_val(Constraint, DimType, i);
+    if (isl_val_is_zero(Val)) {
+      isl_val_free(Val);
+      continue;
+    }
+    if (DimPos >= 0 || (DimType == isl_dim_out && !isl_val_is_one(Val)) ||
+        (DimType == isl_dim_in && !isl_val_is_negone(Val))) {
+      isl_val_free(Val);
+      isl_local_space_free(LocalSpace);
+      return -1;
+    }
+    DimPos = i;
+    isl_val_free(Val);
+  }
+  isl_local_space_free(LocalSpace);
+  return DimPos;
+}
+
+/// Check the form of the isl constraint.
+///
+/// Check that the @p DimInPos input dimension of the isl constraint
+/// @p Constraint has a coefficient that is equal to negative one, the @p
+/// DimOutPos has a coefficient that is equal to one and others
+/// have coefficients equal to zero.
+///
+/// @param Constraint The isl constraint to be checked.
+/// @param DimInPos   The input dimension of the isl constraint.
+/// @param DimOutPos  The output dimension of the isl constraint.
+/// @return           isl_stat_ok in case the isl constraint satisfies
+///                   the requirements, isl_stat_error otherwise.
+static isl_stat isMatMulOperandConstraint(__isl_keep isl_constraint *Constraint,
+                                          int &DimInPos, int &DimOutPos) {
+  auto *Val = isl_constraint_get_constant_val(Constraint);
+  if (!isl_constraint_is_equality(Constraint) || !isl_val_is_zero(Val)) {
+    isl_val_free(Val);
+    return isl_stat_error;
+  }
+  isl_val_free(Val);
+  DimInPos = getMatMulConstraintDim(Constraint, isl_dim_in);
+  if (DimInPos < 0)
+    return isl_stat_error;
+  DimOutPos = getMatMulConstraintDim(Constraint, isl_dim_out);
+  if (DimOutPos < 0)
+    return isl_stat_error;
+  return isl_stat_ok;
+}
+
+/// Check that the access relation corresponds to a non-constant operand
+/// of the matrix multiplication.
+///
+/// Access relations that correspond to non-constant operands of the matrix
+/// multiplication depend only on two input dimensions and have two output
+/// dimensions. The function checks that the isl basic map @p bmap satisfies
+/// the requirements. The two input dimensions can be specified via @p user
+/// array.
+///
+/// @param bmap The isl basic map to be checked.
+/// @param user The input dimensions of @p bmap.
+/// @return     isl_stat_ok in case isl basic map satisfies the requirements,
+///             isl_stat_error otherwise.
+static isl_stat isMatMulOperandBasicMap(__isl_take isl_basic_map *bmap,
+                                        void *user) {
+  auto *Constraints = isl_basic_map_get_constraint_list(bmap);
+  isl_basic_map_free(bmap);
+  if (isl_constraint_list_n_constraint(Constraints) != 2) {
+    isl_constraint_list_free(Constraints);
+    return isl_stat_error;
+  }
+  int InPosPair[] = {-1, -1};
+  auto DimInPos = user ? static_cast<int *>(user) : InPosPair;
+  for (int i = 0; i < 2; i++) {
+    auto *Constraint = isl_constraint_list_get_constraint(Constraints, i);
+    int InPos, OutPos;
+    if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
+            isl_stat_error ||
+        OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos)) {
+      isl_constraint_free(Constraint);
+      isl_constraint_list_free(Constraints);
+      return isl_stat_error;
+    }
+    DimInPos[OutPos] = InPos;
+    isl_constraint_free(Constraint);
+  }
+  isl_constraint_list_free(Constraints);
+  return isl_stat_ok;
+}
+
+/// Permute the two dimensions of the isl map.
+///
+/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
+/// have type @p DimType.
+///
+/// @param Map     The isl map to be modified.
+/// @param DimType The type of the dimensions.
+/// @param DstPos  The first dimension.
+/// @param SrcPos  The second dimension.
+/// @return        The modified map.
+__isl_give isl_map *permuteDimensions(__isl_take isl_map *Map,
+                                      enum isl_dim_type DimType,
+                                      unsigned DstPos, unsigned SrcPos) {
+  assert(DstPos < isl_map_dim(Map, DimType) &&
+         SrcPos < isl_map_dim(Map, DimType));
+  if (DstPos == SrcPos)
+    return Map;
+  isl_id *DimId = nullptr;
+  if (isl_map_has_tuple_id(Map, DimType))
+    DimId = isl_map_get_tuple_id(Map, DimType);
+  auto FreeDim = DimType == isl_dim_in ? isl_dim_out : isl_dim_in;
+  isl_id *FreeDimId = nullptr;
+  if (isl_map_has_tuple_id(Map, FreeDim))
+    FreeDimId = isl_map_get_tuple_id(Map, FreeDim);
+  auto MaxDim = std::max(DstPos, SrcPos);
+  auto MinDim = std::min(DstPos, SrcPos);
+  Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MaxDim, 1);
+  Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MinDim, 1);
+  Map = isl_map_move_dims(Map, DimType, MinDim, FreeDim, 1, 1);
+  Map = isl_map_move_dims(Map, DimType, MaxDim, FreeDim, 0, 1);
+  if (DimId)
+    Map = isl_map_set_tuple_id(Map, DimType, DimId);
+  if (FreeDimId)
+    Map = isl_map_set_tuple_id(Map, FreeDim, FreeDimId);
+  return Map;
+}
+
+/// Check the form of the access relation.
+///
+/// Check that the access relation @p AccMap has the form M[i][j], where i
+/// is a @p FirstPos and j is a @p SecondPos.
+///
+/// @param AccMap    The access relation to be checked.
+/// @param FirstPos  The index of the input dimension that is mapped to
+///                  the first output dimension.
+/// @param SecondPos The index of the input dimension that is mapped to the
+///                  second output dimension.
+/// @return          True in case @p AccMap has the expected form and false,
+///                  otherwise.
+static bool isMatMulOperandAcc(__isl_keep isl_map *AccMap, int &FirstPos,
+                               int &SecondPos) {
+  int DimInPos[] = {FirstPos, SecondPos};
+  if (isl_map_foreach_basic_map(AccMap, isMatMulOperandBasicMap,
+                                static_cast<void *>(DimInPos)) != isl_stat_ok ||
+      DimInPos[0] < 0 || DimInPos[1] < 0)
+    return false;
+  FirstPos = DimInPos[0];
+  SecondPos = DimInPos[1];
+  return true;
+}
+
+/// Does the memory access represent a non-scalar operand of the matrix
+/// multiplication.
+///
+/// Check that the memory access @p MemAccess is the read access to a non-scalar
+/// operand of the matrix multiplication or its result.
+///
+/// @param MemAccess The memory access to be checked.
+/// @param MMI       Parameters of the matrix multiplication operands.
+/// @return          True in case the memory access represents the read access
+///                  to a non-scalar operand of the matrix multiplication and
+///                  false, otherwise.
+static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
+                                        MatMulInfoTy &MMI) {
+  if (!MemAccess->isArrayKind() || !MemAccess->isRead())
+    return false;
+  isl_map *AccMap = MemAccess->getAccessRelation();
+  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.ReadFromC = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.A = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.B = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  isl_map_free(AccMap);
+  return false;
+}
+
+/// Check accesses to operands of the matrix multiplication.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, are scalar in terms of loops
+/// containing the matrix multiplication, in case they do not represent
+/// accesses to the non-scalar operands of the matrix multiplication or
+/// its result.
+///
+/// @param  PartialSchedule The partial schedule of the SCoP statement.
+/// @param  MMI             Parameters of the matrix multiplication operands.
+/// @return                 True in case the corresponding SCoP statement
+///                         represents matrix multiplication and false,
+///                         otherwise.
+static bool containsOnlyMatrMultAcc(__isl_keep isl_map *PartialSchedule,
+                                    MatMulInfoTy &MMI) {
+  auto *InputDimId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
+  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimId));
+  isl_id_free(InputDimId);
+  unsigned OutDimNum = isl_map_dim(PartialSchedule, isl_dim_out);
+  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  auto *MapI = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.i, OutDimNum - 1);
+  auto *MapJ = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.j, OutDimNum - 1);
+  auto *MapK = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.k, OutDimNum - 1);
+  for (auto *MemA = Stmt->begin(); MemA != Stmt->end() - 1; MemA++) {
+    auto *MemAccessPtr = *MemA;
+    if (MemAccessPtr->isArrayKind() && MemAccessPtr != MMI.WriteToC &&
+        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
+        !(MemAccessPtr->isStrideZero(isl_map_copy(MapI)) &&
+          MemAccessPtr->isStrideZero(isl_map_copy(MapJ)) &&
+          MemAccessPtr->isStrideZero(isl_map_copy(MapK)))) {
+      isl_map_free(MapI);
+      isl_map_free(MapJ);
+      isl_map_free(MapK);
+      return false;
+    }
+  }
+  isl_map_free(MapI);
+  isl_map_free(MapJ);
+  isl_map_free(MapK);
+  return true;
+}
+
+/// Check for dependencies corresponding to the matrix multiplication.
+///
+/// Check that there is only true dependence of the form
+/// S(..., k, ...) -> S(..., k + 1, ...), where S is the SCoP statement
+/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
+/// to the dependency produced by the matrix multiplication.
+///
+/// @param  Schedule The schedule of the SCoP statement.
+/// @param  D The SCoP dependencies.
+/// @param  Pos The parameter to describe an acceptable true dependence.
+///             In case it has a negative value, try to determine its
+///             acceptable value.
+/// @return True in case dependencies correspond to the matrix multiplication
+///         and false, otherwise.
+static bool containsOnlyMatMulDep(__isl_keep isl_map *Schedule,
+                                  const Dependences *D, int &Pos) {
+  auto *WAR = D->getDependences(Dependences::TYPE_WAR);
+  if (!isl_union_map_is_empty(WAR)) {
+    isl_union_map_free(WAR);
+    return false;
+  }
+  isl_union_map_free(WAR);
+  auto *RAW = D->getDependences(Dependences::TYPE_RAW);
+  auto *Domain = isl_map_domain(isl_map_copy(Schedule));
+  auto *Space = isl_space_map_from_domain_and_range(isl_set_get_space(Domain),
+                                                    isl_set_get_space(Domain));
+  isl_set_free(Domain);
+  auto *Deltas = isl_map_deltas(isl_union_map_extract_map(RAW, Space));
+  int DeltasDimNum = isl_set_dim(Deltas, isl_dim_set);
+  for (int i = 0; i < DeltasDimNum; i++) {
+    auto *Val = isl_set_plain_get_val_if_fixed(Deltas, isl_dim_set, i);
+    if (Pos < 0 && isl_val_is_one(Val))
+      Pos = i;
+    if (isl_val_is_nan(Val) ||
+        !(isl_val_is_zero(Val) || (i == Pos && isl_val_is_one(Val)))) {
+      isl_val_free(Val);
+      isl_union_map_free(RAW);
+      isl_set_free(Deltas);
+      return false;
+    }
+    isl_val_free(Val);
+  }
+  isl_union_map_free(RAW);
+  isl_set_free(Deltas);
+  return true;
 }
 
 /// Check if the SCoP statement could probably be optimized with analytical
@@ -495,50 +771,57 @@ static bool isInputDimUsed(__isl_take is
 ///
 /// containsMatrMult tries to determine whether the following conditions
 /// are true:
-/// 1. all memory accesses of the statement will have stride 0 or 1,
-///    if we interchange loops (switch the variable used in the inner
-///    loop to the outer loop).
-/// 2. all memory accesses of the statement except from the last one, are
-///    read memory access and the last one is write memory access.
-/// 3. all subscripts of the last memory access of the statement don't contain
-///    the variable used in the inner loop.
+/// 1. The last memory access modeling an array, MA1, represents writing to
+///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
+///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
+///    under consideration.
+/// 2. There is only one loop-carried true dependency, and it has the
+///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
+///    other loop-carried or anti dependencies.
+/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
+///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
+///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
+///    and all memory accesses of the SCoP that are different from MA1, MA2,
+///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
+///    of loops i1, i2 and i3.
 ///
 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
 ///        to check.
-static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
-  auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
-  auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
+/// @param D   The SCoP dependencies.
+/// @param MMI Parameters of the matrix multiplication operands.
+static bool containsMatrMult(__isl_keep isl_map *PartialSchedule,
+                             const Dependences *D, MatMulInfoTy &MMI) {
+  auto *InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
+  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
   isl_id_free(InputDimsId);
-  if (ScpStmt->size() <= 1)
+  if (Stmt->size() <= 1)
     return false;
-  auto MemA = ScpStmt->begin();
-  for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
-       i++, MemA++)
-    if (!(*MemA)->isRead() ||
-        ((*MemA)->isArrayKind() &&
-         !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-           (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
+  for (auto *MemA = Stmt->end() - 1; MemA != Stmt->begin(); MemA--) {
+    auto *MemAccessPtr = *MemA;
+    if (!MemAccessPtr->isArrayKind())
+      continue;
+    if (!MemAccessPtr->isWrite())
       return false;
-  MemA++;
-  if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
-      !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-        (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
-    return false;
-  auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
-  return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
-}
-
-/// Circular shift of output dimensions of the integer map.
-///
-/// @param IslMap The isl map to be modified.
-static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
-  auto DimNum = isl_map_dim(IslMap, isl_dim_out);
-  if (DimNum == 0)
-    return IslMap;
-  auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
-  IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
-  IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
+    auto *AccMap = MemAccessPtr->getAccessRelation();
+    if (isl_map_n_basic_map(AccMap) != 1 ||
+        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j)) {
+      isl_map_free(AccMap);
+      return false;
+    }
+    isl_map_free(AccMap);
+    MMI.WriteToC = MemAccessPtr;
+    break;
+  }
+
+  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
+    return false;
+
+  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
+    return false;
+
+  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
+    return false;
+  return true;
 }
 
 /// Permute two dimensions of the band node.
@@ -581,12 +864,15 @@ __isl_give isl_schedule_node *ScheduleTr
   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
       MacroKernelParams.Kc == 1)
     return Node;
-  Node = tileNode(
-      Node, "1st level tiling",
-      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+  int DimOutNum = isl_schedule_node_band_n_member(Node);
+  std::vector<int> TileSizes(DimOutNum, 1);
+  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
+  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
+  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
+  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
-  Node = permuteBandNodeDimensions(Node, 1, 2);
-  Node = permuteBandNodeDimensions(Node, 0, 2);
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
 }
 
@@ -659,165 +945,6 @@ getMacroKernelParams(const MicroKernelPa
   return {Mc, Nc, Kc};
 }
 
-/// Identify a memory access through the shape of its memory access relation.
-///
-/// Identify the unique memory access in @p Stmt, that has an access relation
-/// equal to @p ExpectedAccessRelation.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses under
-///             consideration.
-/// @param ExpectedAccessRelation The access relation that identifies
-///                               the memory access.
-/// @return  The memory access of @p Stmt whose memory access relation is equal
-///          to @p ExpectedAccessRelation. nullptr in case there is no or more
-///          than one such access.
-MemoryAccess *
-identifyAccessByAccessRelation(ScopStmt *Stmt,
-                               __isl_take isl_map *ExpectedAccessRelation) {
-  if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
-    ExpectedAccessRelation =
-        isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
-  MemoryAccess *IdentifiedAccess = nullptr;
-  for (auto *Access : *Stmt) {
-    auto *AccessRelation = Access->getAccessRelation();
-    AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
-    if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
-      if (IdentifiedAccess) {
-        isl_map_free(AccessRelation);
-        isl_map_free(ExpectedAccessRelation);
-        return nullptr;
-      }
-      IdentifiedAccess = Access;
-    }
-    isl_map_free(AccessRelation);
-  }
-  isl_map_free(ExpectedAccessRelation);
-  return IdentifiedAccess;
-}
-
-/// Add constrains to @Dim dimension of @p ExtMap.
-///
-/// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
-/// the following constraint will be added
-/// Bound * OM <= IM <= Bound * (OM + 1) - 1,
-/// where M is @p Dim and Bound is @p Bound.
-///
-/// @param ExtMap The isl map to be modified.
-/// @param Dim The output dimension to be modfied.
-/// @param Bound The value that is used to specify the constraint.
-/// @return The modified isl map
-__isl_give isl_map *
-addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
-                                   unsigned Bound) {
-  assert(Bound != 0);
-  auto *ExtMapSpace = isl_map_get_space(ExtMap);
-  auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
-  auto *Constr =
-      isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
-  Constr =
-      isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
-  ExtMap = isl_map_add_constraint(ExtMap, Constr);
-  Constr = isl_constraint_alloc_inequality(ConstrSpace);
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
-  Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
-  return isl_map_add_constraint(ExtMap, Constr);
-}
-
-/// Create an access relation that is specific for matrix multiplication
-/// pattern.
-///
-/// Create an access relation of the following form:
-/// { [O0, O1, O2]->[I1, I2, I3] :
-///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
-///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
-///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
-///   where FirstOutputDimBound is @p FirstOutputDimBound,
-///   SecondOutputDimBound is @p SecondOutputDimBound,
-///   ThirdOutputDimBound is @p ThirdOutputDimBound
-///
-/// @param Ctx The isl context.
-/// @param FirstOutputDimBound,
-///        SecondOutputDimBound,
-///        ThirdOutputDimBound The parameters of the access relation.
-/// @return The specified access relation.
-__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
-                                 unsigned SecondOutputDimBound,
-                                 unsigned ThirdOutputDimBound) {
-  auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
-  auto *extensionMap = isl_map_universe(NewRelSpace);
-  if (!FirstOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
-                                                      FirstOutputDimBound);
-  if (!SecondOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
-                                                      SecondOutputDimBound);
-  if (!ThirdOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
-                                                      ThirdOutputDimBound);
-  return extensionMap;
-}
-
-/// Create an access relation that is specific to the matrix
-///        multiplication pattern.
-///
-/// Create an access relation of the following form:
-/// Stmt[O0, O1, O2]->[OI, OJ],
-/// where I is @p I, J is @J
-///
-/// @param Stmt The SCoP statement for which to generate the access relation.
-/// @param I The index of the input dimension that is mapped to the first output
-///          dimension.
-/// @param J The index of the input dimension that is mapped to the second
-///          output dimension.
-/// @return The specified access relation.
-__isl_give isl_map *
-getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
-  auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
-  auto *AccessRel = isl_map_universe(AccessRelSpace);
-  AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
-  AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
-  AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
-  return AccessRel;
-}
-
-/// Identify the memory access that corresponds to the access to the second
-/// operand of the matrix multiplication.
-///
-/// Identify the memory access that corresponds to the access
-/// to the matrix B of the matrix multiplication C = A x B.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses
-///             under consideration.
-/// @return The memory access of @p Stmt that corresponds to the access
-///         to the second operand of the matrix multiplication.
-MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
-  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
-  return identifyAccessByAccessRelation(Stmt, OriginalRel);
-}
-
-/// Identify the memory access that corresponds to the access to the first
-/// operand of the matrix multiplication.
-///
-/// Identify the memory access that corresponds to the access
-/// to the matrix A of the matrix multiplication C = A x B.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses
-///             under consideration.
-/// @return The memory access of @p Stmt that corresponds to the access
-///         to the first operand of the matrix multiplication.
-MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
-  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
-  return identifyAccessByAccessRelation(Stmt, OriginalRel);
-}
-
 /// Create an access relation that is specific to
 ///        the matrix multiplication pattern.
 ///
@@ -893,21 +1020,15 @@ createExtensionNode(__isl_take isl_sched
 ///                     transformations.
 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
 ///                                 to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands.
 /// @return The optimized schedule node.
 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
-    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
-  // Check whether memory accesses of the SCoP statement correspond to
-  // the matrix multiplication pattern and if this is true, obtain them.
+    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams,
+    MatMulInfoTy &MMI) {
   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
   isl_id_free(InputDimsId);
-  MemoryAccess *MemAccessA = identifyAccessA(Stmt);
-  MemoryAccess *MemAccessB = identifyAccessB(Stmt);
-  if (!MemAccessA || !MemAccessB) {
-    isl_map_free(MapOldIndVar);
-    return Node;
-  }
 
   // Create a copy statement that corresponds to the memory access to the
   // matrix B, the second operand of the matrix multiplication.
@@ -920,23 +1041,23 @@ static __isl_give isl_schedule_node *opt
   unsigned SecondDimSize = MacroParams.Kc;
   unsigned ThirdDimSize = MicroParams.Nr;
   auto *SAI = Stmt->getParent()->createScopArrayInfo(
-      MemAccessB->getElementType(), "Packed_B",
+      MMI.B->getElementType(), "Packed_B",
       {FirstDimSize, SecondDimSize, ThirdDimSize});
   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-  auto *OldAcc = MemAccessB->getAccessRelation();
-  MemAccessB->setNewAccessRelation(AccRel);
+  auto *OldAcc = MMI.B->getAccessRelation();
+  MMI.B->setNewAccessRelation(AccRel);
   auto *ExtMap =
-      getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
-  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
-  ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
+      isl_map_project_out(isl_map_copy(MapOldIndVar), isl_dim_out, 2,
+                          isl_map_dim(MapOldIndVar, isl_dim_out) - 2);
+  ExtMap = isl_map_reverse(ExtMap);
+  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.i, 0);
   auto *Domain = Stmt->getDomain();
 
   // Restrict the domains of the copy statements to only execute when also its
   // originating statement is executed.
   auto *DomainId = isl_set_get_tuple_id(Domain);
   auto *NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
+      OldAcc, MMI.B->getAccessRelation(), isl_set_copy(Domain));
   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
@@ -945,20 +1066,21 @@ static __isl_give isl_schedule_node *opt
   // Create a copy statement that corresponds to the memory access
   // to the matrix A, the first operand of the matrix multiplication.
   Node = isl_schedule_node_child(Node, 0);
-  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
+  AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 4, 6);
   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
   ThirdDimSize = MicroParams.Mr;
   SAI = Stmt->getParent()->createScopArrayInfo(
-      MemAccessA->getElementType(), "Packed_A",
+      MMI.A->getElementType(), "Packed_A",
       {FirstDimSize, SecondDimSize, ThirdDimSize});
   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-  OldAcc = MemAccessA->getAccessRelation();
-  MemAccessA->setNewAccessRelation(AccRel);
-  ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
-  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
-  NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
+  OldAcc = MMI.A->getAccessRelation();
+  MMI.A->setNewAccessRelation(AccRel);
+  ExtMap = isl_map_project_out(MapOldIndVar, isl_dim_out, 3,
+                               isl_map_dim(MapOldIndVar, isl_dim_out) - 3);
+  ExtMap = isl_map_reverse(ExtMap);
+  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.j, 0);
+  NewStmt = Stmt->getParent()->addScopStmt(OldAcc, MMI.A->getAccessRelation(),
+                                           isl_set_copy(Domain));
 
   // Restrict the domains of the copy statements to only execute when also its
   // originating statement is executed.
@@ -998,8 +1120,19 @@ getInductionVariablesSubstitution(__isl_
 }
 
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI,
+    MatMulInfoTy &MMI) {
   assert(TTI && "The target transform info should be provided.");
+  int DimOutNum = isl_schedule_node_band_n_member(Node);
+  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
+  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
+  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
+  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
+  NewK = NewK == DimOutNum - 2 ? NewJ : NewK; // track k across the j swap
+  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
   auto MicroKernelParams = getMicroKernelParams(TTI);
   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
   Node = createMacroKernel(Node, MacroKernelParams);
@@ -1012,21 +1145,21 @@ __isl_give isl_schedule_node *ScheduleTr
   if (!MapOldIndVar)
     return Node;
   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
-                                          MacroKernelParams);
+                                          MacroKernelParams, MMI);
 }
 
 bool ScheduleTreeOptimizer::isMatrMultPattern(
-    __isl_keep isl_schedule_node *Node) {
+    __isl_keep isl_schedule_node *Node, const Dependences *D,
+    MatMulInfoTy &MMI) {
   auto *PartialSchedule =
       isl_schedule_node_band_get_partial_schedule_union_map(Node);
-  if (isl_schedule_node_band_n_member(Node) != 3 ||
+  if (isl_schedule_node_band_n_member(Node) < 3 ||
       isl_union_map_n_map(PartialSchedule) != 1) {
     isl_union_map_free(PartialSchedule);
     return false;
   }
   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
-  NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
-  if (containsMatrMult(NewPartialSchedule)) {
+  if (containsMatrMult(NewPartialSchedule, D, MMI)) {
     isl_map_free(NewPartialSchedule);
     return true;
   }
@@ -1040,11 +1173,13 @@ ScheduleTreeOptimizer::optimizeBand(__is
   if (!isTileableBandNode(Node))
     return Node;
 
-  if (PMBasedOpts && User && isMatrMultPattern(Node)) {
+  const OptimizerAdditionalInfoTy *OAI =
+      static_cast<const OptimizerAdditionalInfoTy *>(User);
+
+  MatMulInfoTy MMI;
+  if (PMBasedOpts && User && isMatrMultPattern(Node, OAI->D, MMI)) {
     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
-    const llvm::TargetTransformInfo *TTI;
-    TTI = static_cast<const llvm::TargetTransformInfo *>(User);
-    Node = optimizeMatMulPattern(Node, TTI);
+    Node = optimizeMatMulPattern(Node, OAI->TTI, MMI);
   }
 
   return standardBandOpts(Node, User);
@@ -1052,9 +1187,9 @@ ScheduleTreeOptimizer::optimizeBand(__is
 
 __isl_give isl_schedule *
 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
-                                        const llvm::TargetTransformInfo *TTI) {
+                                        const OptimizerAdditionalInfoTy *OAI) {
   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
-  Root = optimizeScheduleNode(Root, TTI);
+  Root = optimizeScheduleNode(Root, OAI);
   isl_schedule_free(Schedule);
   auto S = isl_schedule_node_get_schedule(Root);
   isl_schedule_node_free(Root);
@@ -1062,9 +1197,9 @@ ScheduleTreeOptimizer::optimizeSchedule(
 }
 
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+    __isl_take isl_schedule_node *Node, const OptimizerAdditionalInfoTy *OAI) {
   Node = isl_schedule_node_map_descendant_bottom_up(
-      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
+      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(OAI)));
   return Node;
 }
 
@@ -1264,8 +1399,9 @@ bool IslScheduleOptimizer::runOnScop(Sco
 
   Function &F = S.getFunction();
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
   isl_schedule *NewSchedule =
-      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
+      ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
 
   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
     isl_schedule_free(NewSchedule);

Modified: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts.ll?rev=293890&r1=293889&r2=293890&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts.ll Thu Feb  2 08:23:14 2017
@@ -15,63 +15,49 @@
 ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
-define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) {
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
-
-bb10:                                             ; preds = %bb8
-  br label %bb11
-
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
-
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
-
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
-
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 1
-  br label %bb11
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
 
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb32:                                             ; preds = %bb29
   ret void
 }

Modified: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll?rev=293890&r1=293889&r2=293890&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll Thu Feb  2 08:23:14 2017
@@ -17,63 +17,49 @@
 ; CHECK-NOT: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
-define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) {
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
-
-bb10:                                             ; preds = %bb8
-  br label %bb11
-
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
-
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
-
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
-
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 2
-  br label %bb11
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 2
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
 
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb32:                                             ; preds = %bb29
   ret void
 }

Modified: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll?rev=293890&r1=293889&r2=293890&view=diff
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll (original)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll Thu Feb  2 08:23:14 2017
@@ -31,7 +31,7 @@
 ; CHECK-NEXT:          // 1st level tiling - Points
 ; CHECK-NEXT:          for (int c2 = 0; c2 <= 31; c2 += 1)
 ; CHECK-NEXT:            for (int c3 = 0; c3 <= 31; c3 += 1)
-; CHECK-NEXT:              Stmt_bb14(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:              Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
 ; CHECK-NEXT:        }
 ; CHECK-NEXT:      // Register tiling - Tiles
 ; CHECK-NEXT:      for (int c0 = 0; c0 <= 131; c0 += 1)
@@ -41,38 +41,38 @@
 ; CHECK-NEXT:            // 1st level tiling - Tiles
 ; CHECK-NEXT:            // 1st level tiling - Points
 ; CHECK-NEXT:            {
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 7, c2);
 ; CHECK-NEXT:            }
 ; CHECK-NEXT:          }
 ; CHECK-NEXT:    }
@@ -84,11 +84,17 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c2 = 0; c2 <= 31; c2 += 1)
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:            for (int c3 = 0; c3 <= 31; c3 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:              Stmt_bb14(32 * c0 + c2, 32 * c1 + c3);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:              Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:      // 1st level tiling - Tiles
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c1 = 0; c1 <= 3; c1 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c1 = 0; c1 <= 3; c1 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c3 = 0; c3 <= 1055; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:            CopyStmt_0(0, c3, c4);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c2 = 0; c2 <= 10; c2 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:            for (int c5 = 256 * c1; c5 <= 256 * c1 + 255; c5 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:              CopyStmt_1(c3, 0, c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // Register tiling - Tiles
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c3 = 0; c3 <= 131; c3 += 1)
@@ -96,43 +102,44 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:              for (int c5 = 0; c5 <= 255; c5 += 1) {
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // Register tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // 1st level tiling - Tiles
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:               // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                {
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:              }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        }
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    }
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -142,60 +149,43 @@ define internal void @kernel_gemm(i32 %a
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
-
-bb10:                                             ; preds = %bb8
-  br label %bb11
-
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
-
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
-
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
-
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 1
-  br label %bb11
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
 
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb32:                                             ; preds = %bb29
   ret void
 }
-
-attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }