[Mlir-commits] [mlir] 7625742 - [mlir][Linalg] Add support for tileAndDistribute on tensors.
Nicolas Vasilache
llvmlistbot at llvm.org
Mon Nov 16 03:16:08 PST 2020
Author: Nicolas Vasilache
Date: 2020-11-16T11:12:50Z
New Revision: 76257422378e54dc2b59ff034e2955e9518e6c99
URL: https://github.com/llvm/llvm-project/commit/76257422378e54dc2b59ff034e2955e9518e6c99
DIFF: https://github.com/llvm/llvm-project/commit/76257422378e54dc2b59ff034e2955e9518e6c99.diff
LOG: [mlir][Linalg] Add support for tileAndDistribute on tensors.
scf.parallel is currently not a good fit for tiling on tensors.
Instead, provide a path to parallelism directly through scf.for.
For now, this transformation ignores the distribution scheme and always performs a block-cyclic mapping (where the block size is the tile size).
Differential revision: https://reviews.llvm.org/D90475
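
The block-cyclic mapping mentioned above offsets each tiled loop's lower bound by procId * step and scales the step by nprocs, so processor procId executes tiles procId, procId + nprocs, procId + 2 * nprocs, and so on. The following is a minimal standalone C++ sketch of that arithmetic (illustrative only, not part of the patch; all names are local to the example):

    #include <cstdio>

    int main() {
      // One tiled loop: 4 tiles of size 8 over the range [0, 32).
      const int lb = 0, ub = 32, step = 8;
      const int nprocs = 2; // e.g. two GPU blocks along one dimension.
      for (int procId = 0; procId < nprocs; ++procId) {
        // Block-cyclic rewrite applied by the distribution:
        //   lb' = lb + procId * step,  step' = nprocs * step.
        for (int iv = lb + procId * step; iv < ub; iv += nprocs * step)
          std::printf("proc %d handles the tile starting at %d\n", procId, iv);
      }
      return 0;
    }

With nprocs = 2, processor 0 handles the tiles starting at 0 and 16 and processor 1 the tiles starting at 8 and 24, which is the schedule the gpu.block_id / gpu.grid_dim based test added below checks for.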
Added:
Modified:
mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
mlir/include/mlir/Dialect/SCF/SCF.h
mlir/lib/Dialect/Linalg/Utils/Utils.cpp
mlir/lib/Dialect/SCF/EDSC/Builders.cpp
mlir/lib/Dialect/SCF/SCF.cpp
mlir/test/Dialect/Linalg/tile-and-distribute.mlir
mlir/test/EDSC/builder-api-test.cpp
mlir/test/lib/Transforms/TestLinalgTransforms.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
index fe8df4c2d0e4..8622d8c98315 100644
--- a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
+++ b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
@@ -24,15 +24,15 @@ namespace edsc {
/// Adapters for building loop nests using the builder and the location stored
/// in ScopedContext. Actual builders are in scf::buildLoopNest.
-scf::ValueVector loopNestBuilder(ValueRange lbs, ValueRange ubs,
+scf::LoopNest loopNestBuilder(ValueRange lbs, ValueRange ubs,
ValueRange steps,
function_ref<void(ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(Value lb, Value ub, Value step,
+scf::LoopNest loopNestBuilder(Value lb, Value ub, Value step,
function_ref<void(Value)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
Value lb, Value ub, Value step, ValueRange iterArgInitValues,
function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
-scf::ValueVector loopNestBuilder(
+scf::LoopNest loopNestBuilder(
ValueRange lbs, ValueRange ubs, ValueRange steps,
ValueRange iterArgInitValues,
function_ref<scf::ValueVector(ValueRange, ValueRange)> fun = nullptr);
diff --git a/mlir/include/mlir/Dialect/SCF/SCF.h b/mlir/include/mlir/Dialect/SCF/SCF.h
index 55c8cbf5fa74..619ebd2639e7 100644
--- a/mlir/include/mlir/Dialect/SCF/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/SCF.h
@@ -51,6 +51,11 @@ ParallelOp getParallelForInductionVarOwner(Value val);
/// An owning vector of values, handy to return from functions.
using ValueVector = std::vector<Value>;
+using LoopVector = std::vector<scf::ForOp>;
+struct LoopNest {
+ ResultRange getResults() { return loops.front().getResults(); }
+ LoopVector loops;
+};
/// Creates a perfect nest of "for" loops, i.e. all loops but the innermost
/// contain only another loop and a terminator. The lower, upper bounds and
@@ -65,11 +70,12 @@ using ValueVector = std::vector<Value>;
/// yielded from the loop body and forwarded back through the loop nest. If the
/// function is not provided, the loop nest is not expected to have iteration
/// arguments, the body of the innermost loop will be left empty, containing
-/// only the zero-operand terminator. Returns the values yielded by the
-/// outermost loop. If bound arrays are empty, the body builder will be called
+/// only the zero-operand terminator. Returns the LoopNest containing the list
+/// of perfectly nested scf::ForOp's built during the call.
+/// If bound arrays are empty, the body builder will be called
/// once to construct the IR outside of the loop with an empty list of induction
/// variables.
-ValueVector buildLoopNest(
+LoopNest buildLoopNest(
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
ValueRange steps, ValueRange iterArgs,
function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -78,7 +84,8 @@ ValueVector buildLoopNest(
/// A convenience version for building loop nests without iteration arguments
/// (like for reductions). Does not take the initial value of reductions or
/// expect the body building functions to return their current value.
-ValueVector buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
+/// The built nested scf::ForOp's are returned in the LoopNest.
+LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs,
ValueRange ubs, ValueRange steps,
function_ref<void(OpBuilder &, Location, ValueRange)>
bodyBuilder = nullptr);
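
For reference, a hedged usage sketch of the new buildLoopNest return type from the hunk above (not part of the patch; an OpBuilder `b`, a Location `loc`, index-typed Values `lb`, `ub`, `step` and an initial iter_arg `init` are assumed to be in scope):

    SmallVector<Value, 1> lbs{lb}, ubs{ub}, steps{step}, iterArgs{init};
    scf::LoopNest nest = scf::buildLoopNest(
        b, loc, lbs, ubs, steps, iterArgs,
        [](OpBuilder &nested, Location nestedLoc, ValueRange ivs,
           ValueRange args) {
          // Body builder: this sketch just forwards the iteration argument.
          return scf::ValueVector(args.begin(), args.end());
        });
    // The individual loops are now available for post-processing
    // (outermost first), e.g. for mapping to processor ids ...
    scf::ForOp outermost = nest.loops.front();
    // ... and the values the old API used to return are still reachable.
    ValueRange yielded = nest.getResults();

Callers that only need the yielded values, such as the EDSC test updated below, simply append .getResults() to the existing call.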
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 210d17516718..e5f0ba013e01 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -24,6 +24,7 @@
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
using namespace mlir;
using namespace mlir::linalg;
@@ -171,10 +172,27 @@ void GenerateLoopNest<scf::ForOp>::doit(
ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
ArrayRef<Attribute> iteratorTypes,
function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
- Optional<LinalgLoopDistributionOptions>) {
+ Optional<LinalgLoopDistributionOptions> distributionOptions) {
+  // Create procInfo so that it dominates the loops, if appropriate.
+ OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
+ Location loc = edsc::ScopedContext::getLocation();
+ SmallVector<ProcInfo, 2> procInfo;
+ if (distributionOptions.hasValue())
+ procInfo = distributionOptions->procInfo(builder, loc, ArrayRef<Range>{});
+
SmallVector<Value, 4> lbs, ubs, steps;
unpackRanges(loopRanges, lbs, ubs, steps);
- edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+ LoopNest loopNest =
+ edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
+
+ if (!distributionOptions.hasValue() || loopNest.loops.empty())
+ return;
+
+ // TODO: support distributionMethod, which is currently ignored.
+ for (auto it : llvm::zip(loopNest.loops, procInfo,
+ distributionOptions->distributionMethod))
+ mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
+ std::get<1>(it).nprocs);
}
/// Specialization to build affine "for" nest.
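
To connect the two sides: the procInfo and distributionMethod consumed above come from the LinalgLoopDistributionOptions supplied by the caller. A hedged configuration sketch, mirroring the test pattern added below (getGpuProcIds is the helper defined in TestLinalgTransforms.cpp):

    LinalgLoopDistributionOptions distOptions;
    // One ProcInfo {procId, nprocs} pair per distributed loop; the helper
    // emits gpu.block_id / gpu.grid_dim pairs.
    distOptions.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
    // One method per distributed loop. Per the TODO above the method is
    // currently ignored: the mapping is always block-cyclic with the tile
    // size as the block.
    distOptions.distributionMethod.resize(
        2, DistributionMethod::CyclicNumProcsEqNumIters);

    LinalgTilingOptions tilingOptions =
        LinalgTilingOptions()
            .setTileSizes({8, 8, 4})
            .setLoopType(LinalgTilingLoopType::Loops)
            .setDistributionOptions(distOptions);

Because the zip above pairs loops with procInfo and distributionMethod entries, only the first two (parallel) tiled loops are distributed; the third, reduction loop keeps its sequential scf.for.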
diff --git a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
index 45097186a248..d0ac5f0c3439 100644
--- a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
+++ b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
@@ -14,7 +14,7 @@
using namespace mlir;
using namespace mlir::edsc;
-mlir::scf::ValueVector
+mlir::scf::LoopNest
mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
function_ref<void(ValueRange)> fun) {
// Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -29,7 +29,7 @@ mlir::edsc::loopNestBuilder(ValueRange lbs, ValueRange ubs, ValueRange steps,
});
}
-mlir::scf::ValueVector
+mlir::scf::LoopNest
mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
function_ref<void(Value)> fun) {
// Delegates to the ValueRange-based version by wrapping the lambda.
@@ -42,7 +42,7 @@ mlir::edsc::loopNestBuilder(Value lb, Value ub, Value step,
wrapper);
}
-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
Value lb, Value ub, Value step, ValueRange iterArgInitValues,
function_ref<scf::ValueVector(Value, ValueRange)> fun) {
// Delegates actual construction to scf::buildLoopNest by wrapping `fun` into
@@ -61,7 +61,7 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
});
}
-mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
+mlir::scf::LoopNest mlir::edsc::loopNestBuilder(
ValueRange lbs, ValueRange ubs, ValueRange steps,
ValueRange iterArgInitValues,
function_ref<scf::ValueVector(ValueRange, ValueRange)> fun) {
diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp
index bc8671b9ba85..fe2eb9ced469 100644
--- a/mlir/lib/Dialect/SCF/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/SCF.cpp
@@ -305,7 +305,7 @@ void ForOp::getNumRegionInvocations(ArrayRef<Attribute> operands,
step.getValue().getSExtValue());
}
-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
ValueRange steps, ValueRange iterArgs,
function_ref<ValueVector(OpBuilder &, Location, ValueRange, ValueRange)>
@@ -323,7 +323,7 @@ ValueVector mlir::scf::buildLoopNest(
assert(results.size() == iterArgs.size() &&
"loop nest body must return as many values as loop has iteration "
"arguments");
- return results;
+ return LoopNest();
}
// First, create the loop structure iteratively using the body-builder
@@ -372,11 +372,13 @@ ValueVector mlir::scf::buildLoopNest(
builder.setInsertionPointToEnd(loops.back().getBody());
builder.create<scf::YieldOp>(loc, results);
- // Return the results of the outermost loop.
- return ValueVector(loops.front().result_begin(), loops.front().result_end());
+ // Return the loops.
+ LoopNest res;
+ res.loops.assign(loops.begin(), loops.end());
+ return res;
}
-ValueVector mlir::scf::buildLoopNest(
+LoopNest mlir::scf::buildLoopNest(
OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs,
ValueRange steps,
function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
index 6ff4be0169fb..2a6a7ba7b7e3 100644
--- a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
@@ -172,3 +172,43 @@ func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
// CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
// CHECK: linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors(
+// CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func @matmul_tensors(
+ %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+ -> tensor<?x?xf32> {
+// CHECK: %[[C8:.*]] = constant 8 : index
+// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK: %[[LBY:.*]] = muli %[[BIDY]], %[[C8]] : index
+// CHECK: %[[STEPY:.*]] = muli %[[NBLOCKSY]], %[[C8]] : index
+// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[LBX:.*]] = muli %[[BIDX]], %[[C8]] : index
+// CHECK: %[[STEPX:.*]] = muli %[[NBLOCKSX]], %[[C8]] : index
+// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
+// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
+// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: init(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
+// CHECK: scf.yield %[[TD]] : tensor<?x?xf32>
+// CHECK: scf.yield %[[TD2]] : tensor<?x?xf32>
+// CHECK: scf.yield %[[TD1]] : tensor<?x?xf32>
+ %0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
+ ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
+ init(%arg2: tensor<?x?xf32>)
+ -> tensor<?x?xf32>
+
+// CHECK: return %[[TD0]] : tensor<?x?xf32>
+ return %0 : tensor<?x?xf32>
+}
+
diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp
index 1a866066523e..7677c175ec94 100644
--- a/mlir/test/EDSC/builder-api-test.cpp
+++ b/mlir/test/EDSC/builder-api-test.cpp
@@ -1223,7 +1223,7 @@ TEST_FUNC(builder_loop_for_yield) {
[&](Value iv, ValueRange args) {
Value sum = args[0] + args[1];
return scf::ValueVector{args[1], sum};
- });
+ }).getResults();
results[0] + results[1];
// clang-format off
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index 253d4adf903c..8857bbe09eef 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -409,6 +409,22 @@ static void fillTileAndDistributePatterns(MLIRContext *context,
LinalgMarker(Identifier::get("distribute6", context),
Identifier::get("after_distribute6", context)));
}
+
+ {
+ LinalgLoopDistributionOptions cyclicNprocsEqNiters;
+ cyclicNprocsEqNiters.distributionMethod.resize(
+ 2, DistributionMethod::CyclicNumProcsEqNumIters);
+ cyclicNprocsEqNiters.procInfo =
+ getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+ patterns.insert<LinalgTilingPattern<MatmulOp>>(
+ context,
+ LinalgTilingOptions()
+ .setTileSizes({8, 8, 4})
+ .setLoopType(LinalgTilingLoopType::Loops)
+ .setDistributionOptions(cyclicNprocsEqNiters),
+ LinalgMarker(Identifier::get("tensors_distribute1", context),
+ Identifier::get("tensors_after_distribute1", context)));
+ }
}
static void