[Mlir-commits] [mlir] d7058ac - [mlir] Add MemRef filter to affine data copy optimization

Diego Caballero llvmlistbot at llvm.org
Fri Feb 14 13:47:49 PST 2020


Author: Diego Caballero
Date: 2020-02-14T13:41:45-08:00
New Revision: d7058acc145c637f82cf4aa90358bdcacccf766c

URL: https://github.com/llvm/llvm-project/commit/d7058acc145c637f82cf4aa90358bdcacccf766c
DIFF: https://github.com/llvm/llvm-project/commit/d7058acc145c637f82cf4aa90358bdcacccf766c.diff

LOG: [mlir] Add MemRef filter to affine data copy optimization

This patch extends the affine data copy optimization utility with an
optional memref filter argument. When a memref filter is provided, the
data copy optimization only generates copies for that memref.

Note: this patch is just porting the memref filter feature from Uday's
'hop' branch: https://github.com/bondhugula/llvm-project/tree/hop.

Reviewed By: bondhugula

Differential Revision: https://reviews.llvm.org/D74342
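
As an illustration, a minimal sketch of how a caller might use the new
parameter (it mirrors the test pass added below; `loopNest` and
`filterValue` are hypothetical names assumed to be in scope):

  // Generate copies only for `filterValue`; accesses to any other memref
  // in the range are left untouched.
  AffineCopyOptions copyOptions = {/*generateDma=*/false,
                                   /*slowMemorySpace=*/0,
                                   /*fastMemorySpace=*/0,
                                   /*tagMemorySpace=*/0,
                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
  DenseSet<Operation *> copyNests;
  affineDataCopyGenerate(loopNest.getBody()->begin(),
                         std::prev(loopNest.getBody()->end()), copyOptions,
                         /*filterMemRef=*/filterValue, copyNests);
  // Passing llvm::None instead restores the unfiltered behavior.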

Added: 
    mlir/test/lib/Transforms/TestAffineDataCopy.cpp

Modified: 
    mlir/include/mlir/Transforms/LoopUtils.h
    mlir/lib/Transforms/AffineDataCopyGeneration.cpp
    mlir/lib/Transforms/Utils/LoopUtils.cpp
    mlir/test/Transforms/affine-data-copy.mlir
    mlir/test/Transforms/dma-generate.mlir
    mlir/test/lib/Transforms/CMakeLists.txt
    mlir/test/lib/Transforms/TestLoopFusion.cpp
    mlir/tools/mlir-opt/mlir-opt.cpp

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index 53f2fea1ccfc..cf6316cae643 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -171,9 +171,11 @@ struct AffineCopyOptions {
 /// by its root affine.for. Since we generate alloc's and dealloc's for all fast
 /// buffers (before and after the range of operations resp. or at a hoisted
 /// position), all of the fast memory capacity is assumed to be available for
-/// processing this block range.
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
 uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                 const AffineCopyOptions &copyOptions,
+                                Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
@@ -220,6 +222,11 @@ void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
 /// ```
 void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
                            ArrayRef<Value> numProcessors);
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void gatherLoops(FuncOp func,
+                 DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops);
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
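
For reference, a minimal sketch of how the new `gatherLoops` entry point is
called (this mirrors its use in TestLoopFusion.cpp further below; a FuncOp
`func` is assumed to be in scope and the debug print is purely illustrative):

  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
  gatherLoops(func, depthToLoops);
  // depthToLoops[0] holds the top-level affine.for ops, depthToLoops[1]
  // their immediate children, and so on.
  for (unsigned d = 0, e = depthToLoops.size(); d < e; ++d)
    llvm::dbgs() << "depth " << d << ": " << depthToLoops[d].size()
                 << " loops\n";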

diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
index 449dcfafeceb..5409c557da83 100644
--- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
@@ -179,7 +179,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
     if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
       affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
-                             copyNests);
+                             /*filterMemRef=*/llvm::None, copyNests);
 
       // Returns true if the footprint is known to exceed capacity.
       auto exceedsCapacity = [&](AffineForOp forOp) {
@@ -213,7 +213,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
         // consumed capacity. The footprint check above guarantees this inner
         // loop's footprint fits.
         affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
-                               copyNests);
+                               /*filterMemRef=*/llvm::None, copyNests);
       }
       // Get to the next load or store op after 'forOp'.
       curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
@@ -236,7 +236,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
     assert(!curBegin->isKnownTerminator() && "can't be a terminator");
     // Exclude the affine terminator - hence, the std::prev.
     affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
-                           copyOptions, copyNests);
+                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
   }
 
   return success();

diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 56f954f21422..da3d819cbc3e 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1585,16 +1585,21 @@ static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
   return true;
 }
 
-/// Generates copies for a contiguous sequence of operations in `block` in the
-/// iterator range [`begin', `end'), where `end' can't be past the terminator of
-/// the block (since additional operations are potentially inserted right before
-/// `end'. Returns the total size of the fast buffers used.
-//  Since we generate alloc's and dealloc's for all fast buffers (before and
-//  after the range of operations resp.), all of the fast memory capacity is
-//  assumed to be available for processing this block range.
+/// Performs explicit copying for the contiguous sequence of operations in the
+/// block iterator range [`begin', `end'), where `end' can't be past the
+/// terminator of the block (since additional operations are potentially
+/// inserted right before `end`). Returns the total size of fast memory space
+/// buffers used. `copyOptions` provides various parameters, and the output
+/// argument `copyNests` is the set of all copy nests inserted, each represented
+/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
+/// buffers (before and after the range of operations resp. or at a hoisted
+/// position), all of the fast memory capacity is assumed to be available for
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
 uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
                                       Block::iterator end,
                                       const AffineCopyOptions &copyOptions,
+                                      Optional<Value> filterMemRef,
                                       DenseSet<Operation *> &copyNests) {
   if (begin == end)
     return 0;
@@ -1631,12 +1636,14 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
   block->walk(begin, end, [&](Operation *opInst) {
     // Gather regions to allocate to buffers in faster memory space.
     if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
-      if ((loadOp.getMemRefType().getMemorySpace() !=
+      if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) ||
+          (loadOp.getMemRefType().getMemorySpace() !=
            copyOptions.slowMemorySpace))
         return;
     } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
-      if (storeOp.getMemRefType().getMemorySpace() !=
-          copyOptions.slowMemorySpace)
+      if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) ||
+          storeOp.getMemRefType().getMemorySpace() !=
+              copyOptions.slowMemorySpace)
         return;
     } else {
       // Neither load nor a store op.
@@ -1776,3 +1783,24 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
 
   return totalCopyBuffersSizeInBytes;
 }
+
+/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
+static void gatherLoopsInBlock(
+    Block *block, unsigned currLoopDepth,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  auto &loopsAtDepth = depthToLoops[currLoopDepth];
+  for (auto &op : *block) {
+    if (auto forOp = dyn_cast<AffineForOp>(op)) {
+      loopsAtDepth.push_back(forOp);
+      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
+    }
+  }
+}
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void mlir::gatherLoops(
+    FuncOp func,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  for (auto &block : func)
+    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
+}
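
The filter checks added above can be read as a single predicate. The helper
below is an illustrative rewrite, not code from the patch:

  // Returns true if this access should be skipped: either it does not match
  // an active memref filter, or its memref does not live in the slow memory
  // space being optimized.
  static bool skipAccess(Optional<Value> filterMemRef, Value memref,
                         unsigned memorySpace, unsigned slowMemorySpace) {
    return (filterMemRef.hasValue() && *filterMemRef != memref) ||
           memorySpace != slowMemorySpace;
  }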

diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
index c83beb183021..b2e4fbbf76c1 100644
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -2,6 +2,12 @@
 // Small buffer size to trigger fine copies.
 // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s
 
+// Test affine data copy with a memref filter. We use a test pass that invokes
+// the affine data copy utility on the input loop nest.
+// '-test-affine-data-copy-memref-filter' passes the first memref found in an
+// affine.load op in the innermost loop as a filter.
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
 // footprint -- so that one could write a definite test case and not have to
@@ -16,6 +22,7 @@
 // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>
 
 // CHECK-LABEL: func @matmul
+// FILTER-LABEL: func @matmul
 func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
   affine.for %i = 0 to 4096 step 128 {
     affine.for %j = 0 to 4096 step 128 {
@@ -110,11 +117,29 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40
 // CHECK:   }
 // CHECK: }
 
+// Check that only one memref is copied when memref filter is used.
+
+//      FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+//      FILTER:   alloc() : memref<128x4096xf32>
+//  FILTER-NOT:   alloc()
+//      FILTER:   affine.for %{{.*}} = 0 to 128 {
+//      FILTER:     affine.for %{{.*}} = 0 to 4096 {
+//      FILTER:     affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT:       affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT:         affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT:           affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT:             affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+//      FILTER:   dealloc %1 : memref<128x4096xf32>
+//  FILTER-NOT:   dealloc %1 : memref<128x4096xf32>
+
+// -----
+
 //
 // This test case will lead to single element buffers. These are eventually
 // expected to be turned into registers via alloca and mem2reg.
 //
-// CHECK-SMALL: func @foo
+// CHECK-SMALL-LABEL: func @foo
+// FILTER-LABEL: func @foo
 func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
   affine.for %i = 0 to 1024 {
     affine.for %j = 0 to 1024 {
@@ -161,3 +186,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem
 // CHECK-SMALL:   }
 // CHECK-SMALL: }
 // CHECK-SMALL: return
+
+// Check that only one memref is copied when memref filter is used.
+
+//      FILTER: alloc() : memref<1024x1024xf32>
+//  FILTER-NOT: alloc()
+//      FILTER: affine.for %{{.*}} = 0 to 1024 {
+//      FILTER:   affine.for %{{.*}} = 0 to 1024 {
+//      FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT:   affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT:     affine.for %{{.*}} = 0 to 1024 {
+//      FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
+//  FILTER-NOT: dealloc

diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir
index 9724f990f97c..b1e71e694690 100644
--- a/mlir/test/Transforms/dma-generate.mlir
+++ b/mlir/test/Transforms/dma-generate.mlir
@@ -543,7 +543,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>,
 // CHECK:         affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
 // CHECK:         affine.for %{{.*}} =
 
-// ----
+// -----
 
 #map3 = affine_map<(d0) -> (d0)>
 #map12 = affine_map<(d0) -> (d0 + 3)>
@@ -551,6 +551,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>,
 #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
 #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
 // Test for test case in b/128303048 #4.
+// CHECK-LABEL: func @test_memref_bounds
 func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
   %c0 = constant 0 : index
   affine.for %i8 = 0 to 9 step 3 {

diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 47a0dd92cd06..8c422e718f1f 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(MLIRTestTransforms
+  TestAffineDataCopy.cpp
   TestAllReduceLowering.cpp
   TestCallGraph.cpp
   TestConstantFold.cpp

diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
new file mode 100644
index 000000000000..e03d45cb9dd4
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -0,0 +1,86 @@
+//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to test affine data copy utility functions and
+// options.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Passes.h"
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#define PASS_NAME "test-affine-data-copy"
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
+
+namespace {
+
+struct TestAffineDataCopy : public FunctionPass<TestAffineDataCopy> {
+  TestAffineDataCopy() = default;
+  TestAffineDataCopy(const TestAffineDataCopy &pass) {}
+
+  void runOnFunction() override;
+
+private:
+  Option<bool> clMemRefFilter{
+      *this, "memref-filter",
+      llvm::cl::desc(
+          "Enable memref filter testing in affine data copy optimization"),
+      llvm::cl::init(false)};
+};
+
+} // end anonymous namespace
+
+void TestAffineDataCopy::runOnFunction() {
+  // Gather all AffineForOps by loop depth.
+  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+  gatherLoops(getFunction(), depthToLoops);
+  assert(depthToLoops.size() && "Loop nest not found");
+
+  // Only support tests with a single loop nest and a single innermost loop
+  // for now.
+  unsigned innermostLoopIdx = depthToLoops.size() - 2;
+  if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1)
+    return;
+
+  auto loopNest = depthToLoops[0][0];
+  auto innermostLoop = depthToLoops[innermostLoopIdx][0];
+  Optional<Value> memrefFilter;
+  if (clMemRefFilter) {
+    // Gather MemRef filter. For simplicity, we use the first loaded memref
+    // found in the innermost loop.
+    for (auto &op : *innermostLoop.getBody()) {
+      if (auto load = dyn_cast<AffineLoadOp>(op)) {
+        memrefFilter = load.getMemRef();
+        break;
+      }
+    }
+  }
+
+  AffineCopyOptions copyOptions = {/*generateDma=*/false,
+                                   /*slowMemorySpace=*/0,
+                                   /*fastMemorySpace=*/0,
+                                   /*tagMemorySpace=*/0,
+                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
+  DenseSet<Operation *> copyNests;
+  affineDataCopyGenerate(loopNest.getBody()->begin(),
+                         std::prev(loopNest.getBody()->end()), copyOptions,
+                         memrefFilter, copyNests);
+}
+
+namespace mlir {
+void registerTestAffineDataCopyPass() {
+  PassRegistration<TestAffineDataCopy>(
+      PASS_NAME, "Tests affine data copy utility functions.");
+}
+} // namespace mlir

diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp
index 9ffa347173f6..9214fa9fc433 100644
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp
@@ -19,6 +19,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopFusionUtils.h"
+#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 
 #include "llvm/ADT/STLExtras.h"
@@ -54,19 +55,6 @@ struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
 
 } // end anonymous namespace
 
-// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
-static void
-gatherLoops(Block *block, unsigned currLoopDepth,
-            DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
-  auto &loopsAtDepth = depthToLoops[currLoopDepth];
-  for (auto &op : *block) {
-    if (auto forOp = dyn_cast<AffineForOp>(op)) {
-      loopsAtDepth.push_back(forOp);
-      gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops);
-    }
-  }
-}
-
 // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
 // in range ['loopDepth' + 1, 'maxLoopDepth'].
 // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
@@ -194,8 +182,7 @@ void TestLoopFusion::runOnFunction() {
     do {
       depthToLoops.clear();
       // Gather all AffineForOps by loop depth.
-      for (auto &block : getFunction())
-        gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+      gatherLoops(getFunction(), depthToLoops);
 
       // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
     } while (iterateLoops(depthToLoops, testLoopFusionTransformation,
@@ -204,8 +191,7 @@ void TestLoopFusion::runOnFunction() {
   }
 
   // Gather all AffineForOps by loop depth.
-  for (Block &block : getFunction())
-    gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+  gatherLoops(getFunction(), depthToLoops);
 
   // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
   if (clTestDependenceCheck)

diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index bf6b57c2b624..4df330e77bcd 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -34,6 +34,7 @@ void registerPassManagerTestPass();
 void registerPatternsTestPass();
 void registerSimpleParametricTilingPass();
 void registerSymbolTestPasses();
+void registerTestAffineDataCopyPass();
 void registerTestAllReduceLoweringPass();
 void registerTestCallGraphPass();
 void registerTestConstantFold();
@@ -85,6 +86,7 @@ void registerTestPasses() {
   registerPatternsTestPass();
   registerSimpleParametricTilingPass();
   registerSymbolTestPasses();
+  registerTestAffineDataCopyPass();
   registerTestAllReduceLoweringPass();
   registerTestCallGraphPass();
   registerTestConstantFold();


        

