[polly] r310338 - [ScheduleOptimizer] Make matmul pattern detection work with delicm output

Mon Aug 7 23:15:15 PDT 2017

Author: grosser
Date: Mon Aug  7 23:15:15 2017
New Revision: 310338

URL: http://llvm.org/viewvc/llvm-project?rev=310338&view=rev
Log:
[ScheduleOptimizer] Make matmul pattern detection work with delicm output

In certain cases delicm might decide to not leave the original array write in
the loop body, but to remove it and instead leave a transformed phi node as
write access. This commit teached the matmul pattern detection to order the
memory accesses according to when the access actually happens and use this
information to detect the new pattern. This makes pattern based matmul
optimization work for 2mm and 3mm in polybench 4 after
polly-position=before-vectorizer has been enabled.

Added:
    polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
Modified:
    polly/trunk/include/polly/Simplify.h
    polly/trunk/lib/Transform/ScheduleOptimizer.cpp
    polly/trunk/lib/Transform/Simplify.cpp

Modified: polly/trunk/include/polly/Simplify.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Simplify.h?rev=310338&r1=310337&r2=310338&view=diff
==============================================================================

--- polly/trunk/include/polly/Simplify.h (original)
+++ polly/trunk/include/polly/Simplify.h Mon Aug  7 23:15:15 2017
@@ -14,12 +14,33 @@
 #ifndef POLLY_TRANSFORM_SIMPLIFY_H
 #define POLLY_TRANSFORM_SIMPLIFY_H
 
+#include "llvm/ADT/SmallVector.h"
+
 namespace llvm {
 class PassRegistry;
 class Pass;
 } // namespace llvm
 
 namespace polly {
+
+class MemoryAccess;
+class ScopStmt;
+
+/// Return a vector that contains MemoryAccesses in the order in
+/// which they are executed.
+///
+/// The order is:
+/// - Implicit reads (BlockGenerator::generateScalarLoads)
+/// - Explicit reads and writes (BlockGenerator::generateArrayLoad,
+///   BlockGenerator::generateArrayStore)
+///   - In block statements, the accesses are in order in which their
+///     instructions are executed.
+///   - In region statements, that order of execution is not predictable at
+///     compile-time.
+/// - Implicit writes (BlockGenerator::generateScalarStores)
+///   The order in which implicit writes are executed relative to each other is
+///   undefined.
+llvm::SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt);
 llvm::Pass *createSimplifyPass();
 } // namespace polly
 

Modified: polly/trunk/lib/Transform/ScheduleOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/ScheduleOptimizer.cpp?rev=310338&r1=310337&r2=310338&view=diff
==============================================================================
--- polly/trunk/lib/Transform/ScheduleOptimizer.cpp (original)
+++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp Mon Aug  7 23:15:15 2017
@@ -52,6 +52,7 @@
 #include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
+#include "polly/Simplify.h"
 #include "polly/Support/GICHelper.h"
 #include "polly/Support/ISLOStream.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -670,7 +671,9 @@ static bool containsOnlyMatrMultAcc(isl:
       permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
   auto MapK =
       permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
-  for (auto *MemA = Stmt->begin(); MemA != Stmt->end() - 1; MemA++) {
+
+  auto Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
     auto *MemAccessPtr = *MemA;
     if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
         !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
@@ -745,7 +748,9 @@ static bool containsMatrMult(isl::map Pa
   auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
   if (Stmt->size() <= 1)
     return false;
-  for (auto *MemA = Stmt->end() - 1; MemA != Stmt->begin(); MemA--) {
+
+  auto Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
     auto *MemAccessPtr = *MemA;
     if (!MemAccessPtr->isLatestArrayKind())
       continue;

Modified: polly/trunk/lib/Transform/Simplify.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Transform/Simplify.cpp?rev=310338&r1=310337&r2=310338&view=diff
==============================================================================
--- polly/trunk/lib/Transform/Simplify.cpp (original)
+++ polly/trunk/lib/Transform/Simplify.cpp Mon Aug  7 23:15:15 2017
@@ -98,39 +98,6 @@ static isl::union_map underapproximatedA
   return UResult;
 }
 
-/// Return a vector that contains MemoryAccesses in the order in
-/// which they are executed.
-///
-/// The order is:
-/// - Implicit reads (BlockGenerator::generateScalarLoads)
-/// - Explicit reads and writes (BlockGenerator::generateArrayLoad,
-///   BlockGenerator::generateArrayStore)
-///   - In block statements, the accesses are in order in which their
-///     instructions are executed.
-///   - In region statements, that order of execution is not predictable at
-///     compile-time.
-/// - Implicit writes (BlockGenerator::generateScalarStores)
-///   The order in which implicit writes are executed relative to each other is
-///   undefined.
-static SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt) {
-
-  SmallVector<MemoryAccess *, 32> Accesses;
-
-  for (MemoryAccess *MemAcc : Stmt)
-    if (isImplicitRead(MemAcc))
-      Accesses.push_back(MemAcc);
-
-  for (MemoryAccess *MemAcc : Stmt)
-    if (isExplicitAccess(MemAcc))
-      Accesses.push_back(MemAcc);
-
-  for (MemoryAccess *MemAcc : Stmt)
-    if (isImplicitWrite(MemAcc))
-      Accesses.push_back(MemAcc);
-
-  return Accesses;
-}
-
 class Simplify : public ScopPass {
 private:
   /// The last/current SCoP that is/has been processed.
@@ -700,6 +667,27 @@ public:
 char Simplify::ID;
 } // anonymous namespace
 
+namespace polly {
+SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt) {
+
+  SmallVector<MemoryAccess *, 32> Accesses;
+
+  for (MemoryAccess *MemAcc : Stmt)
+    if (isImplicitRead(MemAcc))
+      Accesses.push_back(MemAcc);
+
+  for (MemoryAccess *MemAcc : Stmt)
+    if (isExplicitAccess(MemAcc))
+      Accesses.push_back(MemAcc);
+
+  for (MemoryAccess *MemAcc : Stmt)
+    if (isImplicitWrite(MemAcc))
+      Accesses.push_back(MemAcc);
+
+  return Accesses;
+}
+} // namespace polly
+
 Pass *polly::createSimplifyPass() { return new Simplify(); }
 
 INITIALIZE_PASS_BEGIN(Simplify, "polly-simplify", "Polly - Simplify", false,

Added: polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll?rev=310338&view=auto
==============================================================================
--- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll (added)
+++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll Mon Aug  7 23:15:15 2017
@@ -0,0 +1,76 @@
+; RUN: opt %loadPolly \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-optree -polly-delicm -polly-simplify \
+; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: asserts
+
+; Check that the pattern matching detects the matrix multiplication pattern
+; after a full run of -polly-optree and -polly-delicm, where the write access
+; is not through the original memory access, but trough a PHI node that was
+; delicmed. This test covers the polybench 2mm and 3mm cases.
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1800 x double]* nocapture %tmp, [2200 x double]* nocapture readonly %A, [1800 x double]* nocapture readonly %B, [2400 x double]* nocapture readnone %C, [2400 x double]* nocapture readnone %D) local_unnamed_addr #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc25, %entry.split
+  %indvars.iv50 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next51, %for.inc25 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.inc22, %for.body
+  %indvars.iv46 = phi i64 [ 0, %for.body ], [ %indvars.iv.next47, %for.inc22 ]
+  %arrayidx5 = getelementptr inbounds [1800 x double], [1800 x double]* %tmp, i64 %indvars.iv50, i64 %indvars.iv46
+  store double 0.000000e+00, double* %arrayidx5, align 8, !tbaa !2
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body8, %for.body3
+  %0 = phi double [ 0.000000e+00, %for.body3 ], [ %add, %for.body8 ]
+  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx12 = getelementptr inbounds [2200 x double], [2200 x double]* %A, i64 %indvars.iv50, i64 %indvars.iv
+  %1 = load double, double* %arrayidx12, align 8, !tbaa !2
+  %mul = fmul double %1, %alpha
+  %arrayidx16 = getelementptr inbounds [1800 x double], [1800 x double]* %B, i64 %indvars.iv, i64 %indvars.iv46
+  %2 = load double, double* %arrayidx16, align 8, !tbaa !2
+  %mul17 = fmul double %mul, %2
+  %add = fadd double %0, %mul17
+  store double %add, double* %arrayidx5, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 2200
+  br i1 %exitcond, label %for.inc22, label %for.body8
+
+for.inc22:                                        ; preds = %for.body8
+  %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+  %exitcond48 = icmp eq i64 %indvars.iv.next47, 1800
+  br i1 %exitcond48, label %for.inc25, label %for.body3
+
+for.inc25:                                        ; preds = %for.inc22
+  %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
+  %exitcond52 = icmp eq i64 %indvars.iv.next51, 1600
+  br i1 %exitcond52, label %for.end27, label %for.body
+
+for.end27:                                        ; preds = %for.inc25
+  ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-avx512vpopcntdq,-clflushopt,-clwb,-clzero,-fma4,-lwp,-mwaitx,-pku,-prefetchwt1,-prfchw,-rdseed,-rtm,-sgx,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}