[Mlir-commits] [mlir] 41d4120 - [mlir][Linalg] Allow distribution of `scf.parallel` loops generated in Linalg to processors

llvmlistbot at llvm.org llvmlistbot at llvm.org
Mon Aug 10 14:52:40 PDT 2020


Author: MaheshRavishankar
Date: 2020-08-10T14:52:17-07:00
New Revision: 41d4120017f99386a62a9c0aac25fd2a369d0e02

URL: https://github.com/llvm/llvm-project/commit/41d4120017f99386a62a9c0aac25fd2a369d0e02
DIFF: https://github.com/llvm/llvm-project/commit/41d4120017f99386a62a9c0aac25fd2a369d0e02.diff

LOG: [mlir][Linalg] Allow distribution of `scf.parallel` loops generated in
Linalg to processors.

This change adds infrastructure to distribute the loops generated in
Linalg to processors at the time they are generated. This addresses the
use case where the loops are instantiated only to be distributed. The
option to distribute is added to LinalgTilingOptions for now, allowing
the distribution to be specified as a transformation option, just like
tiling and promotion.

Differential Revision: https://reviews.llvm.org/D85147
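
For readers skimming the patch, a minimal sketch of how the new option is
meant to be wired up, adapted from the fillTileAndDistributePatterns() test
added in TestLinalgTransforms.cpp below (the getGpuProcIds helper, tile
sizes, and marker names come from that test; the variable name here is
illustrative):

  // Tile a linalg.matmul with parallel loops and distribute the two outer
  // tile loops cyclically to GPU blocks.
  LinalgLoopDistributionOptions cyclicDistribution;
  cyclicDistribution.distributionMethod.resize(2, DistributionMethod::Cyclic);
  // Callback returning the processor id / count for each distributed loop;
  // here gpu.block_id / gpu.grid_dim along "y" for loop 0 and "x" for loop 1.
  cyclicDistribution.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;

  patterns.insert<LinalgTilingPattern<MatmulOp>>(
      context,
      LinalgTilingOptions()
          .setTileSizes({8, 8, 4})
          .setLoopType(LinalgTilingLoopType::ParallelLoops)
          .setDistributionOptions(cyclicDistribution),
      LinalgMarker(Identifier::get("distribute3", context),
                   Identifier::get("after_distribute3", context)));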

Added: 
    mlir/test/Dialect/Linalg/tile-and-distribute.mlir

Modified: 
    mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
    mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
    mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
    mlir/lib/Dialect/Linalg/Utils/Utils.cpp
    mlir/test/lib/Transforms/TestLinalgTransforms.cpp

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 8fce95781c4d..f438b6587c8b 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -270,6 +270,15 @@ struct LinalgTilingOptions {
     loopType = lt;
     return *this;
   }
+
+  /// When specified, describes how the generated tile loops are distributed
+  /// to processors.
+  Optional<LinalgLoopDistributionOptions> distribution = None;
+  LinalgTilingOptions &
+  setDistributionOptions(LinalgLoopDistributionOptions &distributionOptions) {
+    distribution = distributionOptions;
+    return *this;
+  }
 };
 
 /// Canonicalization patterns relevant to apply after tiling patterns. These are

diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index 0dbf863c2f69..794ebcbc2645 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -156,6 +156,70 @@ void applyPermutationToVector(SmallVector<T, N> &inVec,
   inVec = auxVec;
 }
 
+/// Scheme used to distribute loops to processors.
+enum class DistributionMethod {
+  /// Cyclic distribution where no assumption is made about the dynamic
+  /// relationship between number of processors and number of iterations of the
+  /// distributed loop. Distributes the following loop
+  ///
+  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
+  ///
+  /// to
+  ///
+  /// scf.parallel(%iv)= (%lb + %procId * %step) to (%ub) step (%step * %nprocs)
+  Cyclic = 0,
+
+  /// Cyclic distribution where the number of processors can be assumed to be
+  /// greater than or equal to the number of iterations of the distributed
+  /// loop. In such cases, a simple in-bounds check is enough (instead of
+  /// materializing a loop). Distributes the following loop
+  ///
+  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
+  ///
+  /// to
+  ///
+  /// %iv = %lb + %procId * %step
+  /// %cond = cmpi "slt", %iv, %ub
+  /// scf.if %cond {
+  ///   ...
+  /// }
+  CyclicNumProcsGeNumIters = 1,
+
+  /// Cyclic distribution where the number of processors can be assumed to be
+  /// equal to the number of iterations of the distributed loop. In such cases,
+  /// no bounds check is needed. Distributes the following loop
+  ///
+  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
+  ///
+  /// to
+  ///
+  /// %iv = %lb + %procId * %step
+  CyclicNumProcsEqNumIters = 2
+};
+
+/// Utility struct and callback function type used to get the processor ID and
+/// the number of processors used for distribution.
+struct ProcInfo {
+  Value procId;
+  Value nprocs;
+};
+using ProcInfoCallBackFn =
+    std::function<ProcInfo(OpBuilder &b, Location loc, unsigned loopNum)>;
+
+/// Options that allow distribution of loops generated in Linalg transforms to
+/// processors while generating the loops.
+struct LinalgLoopDistributionOptions {
+  /// Callback function that returns the Value for processor ID, and number of
+  /// processors used to execute a given loop.
+  ProcInfoCallBackFn procInfo;
+  /// Specification of how to distribute the `scf.parallel` loops that are
+  /// generated. As each `scf.parallel` loop is generated, the elements of this
+  /// vector are used (from left to right) and the specified distribution is
+  /// applied. If the vector has fewer elements than the number of
+  /// `scf.parallel` loops generated, then no distribution is applied.
+  SmallVector<DistributionMethod, 0> distributionMethod = {};
+};
+
 /// Utility class used to generate nested loops with ranges described by
 /// `loopRanges` and loop type described by the `iteratorTypes`. `bodyBuilderFn`
 /// is used to generate the body of the innermost loop. It is passed a range
@@ -168,7 +232,8 @@ struct GenerateLoopNest {
 
   static void doit(ArrayRef<SubViewOp::Range> loopRanges,
                    ArrayRef<Attribute> iteratorTypes,
-                   function_ref<void(ValueRange)> bodyBuilderFn);
+                   function_ref<void(ValueRange)> bodyBuilderFn,
+                   Optional<LinalgLoopDistributionOptions> = None);
 };
 
 } // namespace linalg

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 2c327276610a..6dc98628850f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -382,7 +382,8 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
   if (!options.interchangeVector.empty())
     applyPermutationToVector(iteratorTypes, options.interchangeVector);
   GenerateLoopNest<LoopTy>::doit(
-      loopRanges, iteratorTypes, [&](ValueRange localIvs) {
+      loopRanges, iteratorTypes,
+      [&](ValueRange localIvs) {
         auto &b = ScopedContext::getBuilderRef();
         auto loc = ScopedContext::getLocation();
         ivs.assign(localIvs.begin(), localIvs.end());
@@ -401,7 +402,8 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
         auto operands = getAssumedNonViewOperands(op);
         views.append(operands.begin(), operands.end());
         res = op.clone(b, loc, views);
-      });
+      },
+      options.distribution);
 
   // 4. Transforms index arguments of `linalg.generic` w.r.t. to the tiling.
   transformIndexedGenericOpIndices(b, res, ivs, loopIndexToRangeIndex);
@@ -410,8 +412,14 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
   SmallVector<Operation *, 8> loops;
   loops.reserve(ivs.size());
   for (auto iv : ivs) {
-    loops.push_back(iv.cast<BlockArgument>().getOwner()->getParentOp());
-    assert(loops.back() && "no owner found for induction variable!");
+    if (iv.isa<BlockArgument>()) {
+      loops.push_back(iv.cast<BlockArgument>().getOwner()->getParentOp());
+      assert(loops.back() && "no owner found for induction variable!");
+    } else {
+      // TODO: Instead of doing this, try to recover the ops used instead of the
+      // loop.
+      loops.push_back(nullptr);
+    }
   }
   return TiledLinalgOp{res, loops};
 }

diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 968f5b1d82c5..4e9cbe9d913d 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -11,6 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
+
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
@@ -149,7 +151,8 @@ namespace linalg {
 template <>
 void GenerateLoopNest<scf::ForOp>::doit(
     ArrayRef<SubViewOp::Range> loopRanges, ArrayRef<Attribute> iteratorTypes,
-    function_ref<void(ValueRange)> bodyBuilderFn) {
+    function_ref<void(ValueRange)> bodyBuilderFn,
+    Optional<LinalgLoopDistributionOptions>) {
   SmallVector<Value, 4> lbs, ubs, steps;
   unpackRanges(loopRanges, lbs, ubs, steps);
   edsc::loopNestBuilder(lbs, ubs, steps, bodyBuilderFn);
@@ -159,7 +162,8 @@ void GenerateLoopNest<scf::ForOp>::doit(
 template <>
 void GenerateLoopNest<AffineForOp>::doit(
     ArrayRef<SubViewOp::Range> loopRanges, ArrayRef<Attribute> iteratorTypes,
-    function_ref<void(ValueRange)> bodyBuilderFn) {
+    function_ref<void(ValueRange)> bodyBuilderFn,
+    Optional<LinalgLoopDistributionOptions>) {
   SmallVector<Value, 4> lbs, ubs, steps;
   unpackRanges(loopRanges, lbs, ubs, steps);
 
@@ -175,12 +179,24 @@ void GenerateLoopNest<AffineForOp>::doit(
   edsc::affineLoopNestBuilder(lbs, ubs, constantSteps, bodyBuilderFn);
 }
 
-/// Generates a loop nest consisting of scf.parallel and scf.for, depending on
-/// the `iteratorTypes.` Consecutive parallel loops create a single scf.parallel
-/// operation; each sequential loop creates a new scf.for operation. The body
-/// of the innermost loop is populated by `bodyBuilderFn` that accepts a range
-/// of induction variables for all loops. `ivStorage` is used to store the
-/// partial list of induction variables.
+/// Update `lb` and `step` to per-processor values for a cyclic distribution.
+static void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
+                                              Value procId, Value nprocs,
+                                              Value &lb, Value &ub,
+                                              Value &step) {
+  using edsc::op::operator+;
+  using edsc::op::operator*;
+  lb = lb + (procId * step);
+  step = nprocs * step;
+}
+
+/// Generates a loop nest consisting of scf.parallel and scf.for, depending
+/// on the `iteratorTypes.` Consecutive parallel loops create a single
+/// scf.parallel operation; each sequential loop creates a new scf.for
+/// operation. The body of the innermost loop is populated by
+/// `bodyBuilderFn` that accepts a range of induction variables for all
+/// loops. `ivStorage` is used to store the partial list of induction
+/// variables.
 // TODO: this function can be made iterative instead. However, it
 // will have at most as many recursive calls as nested loops, which rarely
 // exceeds 10.
@@ -188,7 +204,8 @@ static void
 generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps,
                          ArrayRef<Attribute> iteratorTypes,
                          function_ref<void(ValueRange)> bodyBuilderFn,
-                         SmallVectorImpl<Value> &ivStorage) {
+                         SmallVectorImpl<Value> &ivStorage,
+                         ArrayRef<DistributionMethod> distributionMethod = {}) {
   assert(lbs.size() == ubs.size());
   assert(lbs.size() == steps.size());
   assert(lbs.size() == iteratorTypes.size());
@@ -200,8 +217,8 @@ generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps,
 
   // Find the outermost parallel loops and drop their types from the list.
   unsigned nLoops = iteratorTypes.size();
-  iteratorTypes = iteratorTypes.drop_while(isParallelIteratorType);
-  unsigned nOuterPar = nLoops - iteratorTypes.size();
+  unsigned nOuterPar =
+      nLoops - iteratorTypes.drop_while(isParallelIteratorType).size();
 
   // If there are no outer parallel loops, generate one sequential loop and
   // recurse. Note that we wouldn't have dropped anything from `iteratorTypes`
@@ -211,41 +228,132 @@ generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps,
       ivStorage.push_back(iv);
       generateParallelLoopNest(lbs.drop_front(), ubs.drop_front(),
                                steps.drop_front(), iteratorTypes.drop_front(),
-                               bodyBuilderFn, ivStorage);
+                               bodyBuilderFn, ivStorage, distributionMethod);
     });
     return;
   }
+  if (distributionMethod.empty()) {
+    // Generate a single parallel loop-nest operation for all outermost
+    // parallel loops and recurse.
+    edsc::OperationBuilder<scf::ParallelOp>(
+        lbs.take_front(nOuterPar), ubs.take_front(nOuterPar),
+        steps.take_front(nOuterPar),
+        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
+          edsc::ScopedContext context(nestedBuilder, nestedLoc);
+          ivStorage.append(localIvs.begin(), localIvs.end());
+          generateParallelLoopNest(
+              lbs.drop_front(nOuterPar), ubs.drop_front(nOuterPar),
+              steps.drop_front(nOuterPar), iteratorTypes.drop_front(nOuterPar),
+              bodyBuilderFn, ivStorage,
+              (distributionMethod.size() < nOuterPar)
+                  ? ArrayRef<DistributionMethod>()
+                  : distributionMethod.drop_front(nOuterPar));
+        });
+    return;
+  }
+
+  // Process all consecutive similarly distributed loops simultaneously.
+  DistributionMethod methodToUse = distributionMethod[0];
+  unsigned numProcessed = 1;
+  for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) {
+    if (distributionMethod[i] != methodToUse)
+      break;
+    numProcessed++;
+  }
 
-  // Generate a single parallel loop-nest operation for all outermost parallel
-  // loops and recurse.
-  edsc::OperationBuilder<scf::ParallelOp>(
-      lbs.take_front(nOuterPar), ubs.take_front(nOuterPar),
-      steps.take_front(nOuterPar),
-      [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
-        edsc::ScopedContext context(nestedBuilder, nestedLoc);
-        ivStorage.append(localIvs.begin(), localIvs.end());
-        generateParallelLoopNest(lbs.drop_front(nOuterPar),
-                                 ubs.drop_front(nOuterPar),
-                                 steps.drop_front(nOuterPar), iteratorTypes,
-                                 bodyBuilderFn, ivStorage);
-      });
+  switch (methodToUse) {
+  case DistributionMethod::Cyclic: {
+    // Generate a single parallel loop-nest operation for all outermost
+    // parallel loops and recurse.
+    edsc::OperationBuilder<scf::ParallelOp>(
+        lbs.take_front(numProcessed), ubs.take_front(numProcessed),
+        steps.take_front(numProcessed),
+        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
+          edsc::ScopedContext context(nestedBuilder, nestedLoc);
+          ivStorage.append(localIvs.begin(), localIvs.end());
+          generateParallelLoopNest(
+              lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
+              steps.drop_front(numProcessed),
+              iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
+              (distributionMethod.size() < numProcessed)
+                  ? ArrayRef<DistributionMethod>()
+                  : distributionMethod.drop_front(numProcessed));
+        });
+    return;
+  }
+  case DistributionMethod::CyclicNumProcsGeNumIters: {
+    // Check (for the processed loops) that the iteration is in-bounds.
+    using edsc::op::slt;
+    using edsc::op::operator&&;
+    Value cond = slt(lbs[0], ubs[0]);
+    for (unsigned i = 1; i < numProcessed; ++i)
+      cond = cond && slt(lbs[i], ubs[i]);
+    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
+    edsc::conditionBuilder(cond, [&]() {
+      generateParallelLoopNest(
+          lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
+          steps.drop_front(numProcessed),
+          iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
+          distributionMethod.drop_front(numProcessed));
+    });
+    return;
+  }
+  case DistributionMethod::CyclicNumProcsEqNumIters:
+    // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed
+    // with inner loop generation.
+    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
+    generateParallelLoopNest(
+        lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
+        steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed),
+        bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed));
+    return;
+  }
 }
 
 /// Specialization for generating a mix of parallel and sequential scf loops.
 template <>
 void GenerateLoopNest<scf::ParallelOp>::doit(
     ArrayRef<SubViewOp::Range> loopRanges, ArrayRef<Attribute> iteratorTypes,
-    function_ref<void(ValueRange)> bodyBuilderFn) {
-  SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs;
-  unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage);
-  ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage);
-
+    function_ref<void(ValueRange)> bodyBuilderFn,
+    Optional<LinalgLoopDistributionOptions> distributionOptions) {
   // This function may be passed more iterator types than ranges.
   assert(iteratorTypes.size() >= loopRanges.size() &&
          "expected iterator type for all ranges");
   iteratorTypes = iteratorTypes.take_front(loopRanges.size());
-  ivs.reserve(iteratorTypes.size());
-  generateParallelLoopNest(lbs, ubs, steps, iteratorTypes, bodyBuilderFn, ivs);
+  SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs;
+  unsigned numLoops = iteratorTypes.size();
+  ivs.reserve(numLoops);
+  lbsStorage.reserve(numLoops);
+  ubsStorage.reserve(numLoops);
+  stepsStorage.reserve(numLoops);
+
+  // Get the loop lb, ub, and step.
+  unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage);
+
+  // Modify the lb, ub, and step based on the distribution options.
+  SmallVector<DistributionMethod, 0> distributionMethod;
+  if (distributionOptions) {
+    auto &options = distributionOptions.getValue();
+    unsigned index = 0;
+    OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
+    Location loc = edsc::ScopedContext::getLocation();
+    distributionMethod.assign(distributionOptions->distributionMethod.begin(),
+                              distributionOptions->distributionMethod.end());
+    for (auto iteratorType : enumerate(iteratorTypes))
+      if (isParallelIteratorType(iteratorType.value()) &&
+          index < distributionMethod.size()) {
+        unsigned i = iteratorType.index();
+        ProcInfo procInfo = options.procInfo(builder, loc, index);
+        updateBoundsForCyclicDistribution(builder, loc, procInfo.procId,
+                                          procInfo.nprocs, lbsStorage[i],
+                                          ubsStorage[i], stepsStorage[i]);
+        index++;
+      }
+  }
+  ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage);
+  generateParallelLoopNest(lbs, ubs, steps, iteratorTypes, bodyBuilderFn, ivs,
+                           distributionMethod);
+
   assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
 }
 

diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
new file mode 100644
index 000000000000..e1bc28e133bd
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
@@ -0,0 +1,168 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-tile-and-distribute-options -split-input-file | FileCheck %s
+
+func @gemm1(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute1"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm1(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T1:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: scf.for %[[ARG3:.*]] =
+//      CHECK:   %[[T3:.*]] = affine.apply #[[MAP0]]()[%[[T1]]]
+//      CHECK:   %[[SV1:.*]] = subview %[[ARG0]][%[[T3]], %[[ARG3]]]
+//      CHECK:   %[[T11:.*]] = affine.apply #[[MAP0]]()[%[[T2]]]
+//      CHECK:   %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T11]]]
+//      CHECK:   %[[T15:.*]] = affine.apply #[[MAP0]]()[%[[T1]]]
+//      CHECK:   %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T2]]]
+//      CHECK:   %[[SV3:.*]] = subview %[[ARG2]][%[[T15]], %[[T18]]]
+//      CHECK:   linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]
+
+// -----
+
+func @gemm2(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute2"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm2(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK: %[[T5:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T5]]]
+//      CHECK: %[[T7:.*]] = cmpi "slt", %[[T4]], %{{.*}}
+//      CHECK: %[[T8:.*]] = cmpi "slt", %[[T6]], %{{.*}}
+//      CHECK: %[[T9:.*]] = and %[[T7]], %[[T8]]
+//      CHECK: scf.if %[[T9]]
+//      CHECK:   scf.for %[[ARG3:.*]] =
+//      CHECK:     %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:     %[[SV1:.*]] = subview %[[ARG0]][%[[T10]], %[[ARG3]]]
+//      CHECK:     %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T5]]]
+//      CHECK:     %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T18]]]
+//      CHECK:     %[[T22:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:     %[[T25:.*]] = affine.apply #[[MAP0]]()[%[[T5]]]
+//      CHECK:     %[[SV3:.*]] = subview %[[ARG2]][%[[T22]], %[[T25]]]
+//      CHECK:     linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]
+
+// -----
+
+func @gemm3(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute3"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm3(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T4:.*]] = "gpu.grid_dim"() {dimension = "y"}
+//      CHECK: %[[T5:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK: %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T4]]]
+//      CHECK: %[[T7:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: %[[T8:.*]] = "gpu.grid_dim"() {dimension = "x"}
+//      CHECK: %[[T9:.*]] = affine.apply #[[MAP0]]()[%[[T7]]]
+//      CHECK: %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T8]]]
+//      CHECK: scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) = (%[[T5]], %[[T9]]) to (%{{.*}}, %{{.*}}) step (%[[T6]], %[[T10]])
+//      CHECK:   scf.for %[[ARG5:.*]] =
+//      CHECK:     %[[SV1:.*]] = subview %[[ARG0]][%[[ARG3]], %[[ARG5]]]
+//      CHECK:     %[[SV2:.*]] = subview %[[ARG1]][%[[ARG5]], %[[ARG4]]]
+//      CHECK:     %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[ARG4]]]
+//      CHECK:     linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]
+
+// -----
+
+func @gemm4(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute4"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm4(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK: %[[T5:.*]] = cmpi "slt", %[[T4]], %{{.*}}
+//      CHECK: scf.if %[[T5]]
+//      CHECK:   scf.for %[[ARG3:.*]] =
+//      CHECK:     %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T2]]]
+//      CHECK:     %[[SV1:.*]] = subview %[[ARG0]][%[[T6]], %[[ARG3]]]
+//      CHECK:     %[[T14:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:     %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T14]]]
+//      CHECK:     %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T2]]]
+//      CHECK:     %[[T21:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:     %[[SV3:.*]] = subview %[[ARG2]][%[[T18]], %[[T21]]]
+//      CHECK:     linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]
+
+// -----
+
+func @gemm5(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute5"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm5(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK: %[[T5:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: %[[T6:.*]] = "gpu.grid_dim"() {dimension = "x"}
+//      CHECK: %[[T7:.*]] = affine.apply #[[MAP0]]()[%[[T5]]]
+//      CHECK: %[[T8:.*]] = affine.apply #[[MAP0]]()[%[[T6]]]
+//      CHECK: %[[T9:.*]] = cmpi "slt", %[[T4]], %{{.*}}
+//      CHECK: scf.if %[[T9]]
+//      CHECK:   scf.parallel (%[[ARG3:.*]]) = (%[[T7]]) to (%{{.*}}) step (%[[T8]])
+//      CHECK:     scf.for %[[ARG4:.*]] =
+//      CHECK:       %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:       %[[SV1:.*]] = subview %[[ARG0]][%[[T10]], %[[ARG4]]]
+//      CHECK:       %[[SV2:.*]] = subview %[[ARG1]][%[[ARG4]], %[[ARG3]]]
+//      CHECK:       %[[T21:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK:       %[[SV3:.*]] = subview %[[ARG2]][%[[T21]], %[[ARG3]]]
+//      CHECK:       linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]
+
+// -----
+
+func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul %a, %b, %c {__internal_linalg_transform__ = "distribute6"}
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  return
+}
+//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+//      CHECK: func @gemm6(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
+//      CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "y"}
+//      CHECK: %[[T3:.*]] = "gpu.grid_dim"() {dimension = "y"}
+//      CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T2]]]
+//      CHECK: %[[T5:.*]] = affine.apply #[[MAP0]]()[%[[T3]]]
+//      CHECK: %[[T6:.*]] = "gpu.block_id"() {dimension = "x"}
+//      CHECK: scf.parallel (%[[ARG3:.*]]) = (%[[T4]]) to (%{{.*}}) step (%[[T5]])
+//      CHECK:   scf.for %[[ARG4:.*]] =
+//      CHECK:     %[[SV1:.*]] = subview %[[ARG0]][%[[ARG3]], %[[ARG4]]]
+//      CHECK:     %[[T14:.*]] = affine.apply #[[MAP0]]()[%[[T6]]]
+//      CHECK:     %[[SV2:.*]] = subview %[[ARG1]][%[[ARG4]], %[[T14]]]
+//      CHECK:     %[[T20:.*]] = affine.apply #[[MAP0]]()[%[[T6]]]
+//      CHECK:     %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[T20]]]
+//      CHECK:     linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]]

diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index ff37110f093a..f6c1160d35b0 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
@@ -49,6 +50,10 @@ struct TestLinalgTransforms
   Option<bool> testPromotionOptions{*this, "test-linalg-promotion-options",
                                     llvm::cl::desc("Test promotion options"),
                                     llvm::cl::init(false)};
+  Option<bool> testTileAndDistributionOptions{
+      *this, "test-tile-and-distribute-options",
+      llvm::cl::desc("Test tile and distribute options"),
+      llvm::cl::init(false)};
   Option<bool> testVectorTransferForwardingPatterns{
       *this, "test-vector-transfer-forwarding-patterns",
       llvm::cl::desc(
@@ -143,6 +148,11 @@ static void applyPatterns(FuncOp funcOp) {
       /*loweringType=*/LinalgLoweringType::Loops,
       LinalgMarker(Identifier::get("REG", ctx)));
 
+  //===--------------------------------------------------------------------===//
+  // Linalg distribution patterns.
+  //===--------------------------------------------------------------------===//
+  LinalgLoopDistributionOptions distributionOptions;
+
   //===--------------------------------------------------------------------===//
   // Linalg to vector contraction patterns.
   //===--------------------------------------------------------------------===//
@@ -278,6 +288,122 @@ static void fillPromotionCallBackPatterns(MLIRContext *ctx,
       LinalgMarker(Identifier::get("PROMOTE", ctx)));
 }
 
+template <typename IdOp, typename NProcsOp>
+static ProcInfo getGpuProcIds(OpBuilder &b, Location loc, unsigned loopNum) {
+  Type indexType = b.getIndexType();
+  switch (loopNum) {
+  case 0:
+    return {b.create<IdOp>(loc, indexType, b.getStringAttr("y")),
+            b.create<NProcsOp>(loc, indexType, b.getStringAttr("y"))};
+  case 1:
+    return {b.create<IdOp>(loc, indexType, b.getStringAttr("x")),
+            b.create<NProcsOp>(loc, indexType, b.getStringAttr("x"))};
+  default:
+    llvm_unreachable("test patterns handle only up to 2-level nested loops");
+  }
+  return {nullptr, nullptr};
+}
+
+static void fillTileAndDistributePatterns(MLIRContext *context,
+                                          OwningRewritePatternList &patterns) {
+  {
+    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
+    cyclicNprocsEqNiters.distributionMethod.resize(
+        2, DistributionMethod::CyclicNumProcsEqNumIters);
+    cyclicNprocsEqNiters.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsEqNiters),
+        LinalgMarker(Identifier::get("distribute1", context),
+                     Identifier::get("after_distribute1", context)));
+  }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsGeNiters;
+    cyclicNprocsGeNiters.distributionMethod.resize(
+        2, DistributionMethod::CyclicNumProcsGeNumIters);
+    cyclicNprocsGeNiters.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsGeNiters),
+        LinalgMarker(Identifier::get("distribute2", context),
+                     Identifier::get("after_distribute2", context)));
+  }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsDefault;
+    cyclicNprocsDefault.distributionMethod.resize(2,
+                                                  DistributionMethod::Cyclic);
+    cyclicNprocsDefault.procInfo =
+        getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsDefault),
+        LinalgMarker(Identifier::get("distribute3", context),
+                     Identifier::get("after_distribute3", context)));
+  }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsMixed1;
+    cyclicNprocsMixed1.distributionMethod = {
+        DistributionMethod::CyclicNumProcsEqNumIters,
+        DistributionMethod::CyclicNumProcsGeNumIters};
+    cyclicNprocsMixed1.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsMixed1),
+        LinalgMarker(Identifier::get("distribute4", context),
+                     Identifier::get("after_distribute4", context)));
+  }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsMixed2;
+    cyclicNprocsMixed2.distributionMethod = {
+        DistributionMethod::CyclicNumProcsGeNumIters,
+        DistributionMethod::Cyclic};
+    cyclicNprocsMixed2.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsMixed2),
+        LinalgMarker(Identifier::get("distribute5", context),
+                     Identifier::get("after_distribute5", context)));
+  }
+
+  {
+    LinalgLoopDistributionOptions cyclicNprocsMixed3;
+    cyclicNprocsMixed3.distributionMethod = {
+        DistributionMethod::Cyclic,
+        DistributionMethod::CyclicNumProcsEqNumIters};
+    cyclicNprocsMixed3.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+
+    patterns.insert<LinalgTilingPattern<MatmulOp>>(
+        context,
+        LinalgTilingOptions()
+            .setTileSizes({8, 8, 4})
+            .setLoopType(LinalgTilingLoopType::ParallelLoops)
+            .setDistributionOptions(cyclicNprocsMixed3),
+        LinalgMarker(Identifier::get("distribute6", context),
+                     Identifier::get("after_distribute6", context)));
+  }
+}
+
 static void
 applyMatmulToVectorPatterns(FuncOp funcOp,
                             bool testMatmulToVectorPatterns1dTiling,
@@ -344,6 +470,12 @@ void TestLinalgTransforms::runOnFunction() {
     applyPatternsAndFoldGreedily(getFunction(), patterns);
     return;
   }
+  if (testTileAndDistributionOptions) {
+    OwningRewritePatternList patterns;
+    fillTileAndDistributePatterns(&getContext(), patterns);
+    applyPatternsAndFoldGreedily(getFunction(), patterns);
+    return;
+  }
   if (testPatterns)
     return applyPatterns(getFunction());
   if (testMatmulToVectorPatterns1dTiling || testMatmulToVectorPatterns2dTiling)


        


More information about the Mlir-commits mailing list