[Mlir-commits] [mlir] 7a7eacc - [MLIR][GPU] Implement a simple greedy loop mapper.
Stephan Herhut
llvmlistbot at llvm.org
Tue Feb 25 02:48:30 PST 2020
Author: Stephan Herhut
Date: 2020-02-25T11:42:42+01:00
New Revision: 7a7eacc797f7cc603d50987883ea95aee99d6b22
URL: https://github.com/llvm/llvm-project/commit/7a7eacc797f7cc603d50987883ea95aee99d6b22
DIFF: https://github.com/llvm/llvm-project/commit/7a7eacc797f7cc603d50987883ea95aee99d6b22.diff
LOG: [MLIR][GPU] Implement a simple greedy loop mapper.
Summary:
The mapper assigns annotations to loop.parallel operations that
are compatible with the loop-to-GPU mapping pass. The outermost
loop is mapped to grid dimensions, the next nested loop to block
dimensions, and all remaining loops to sequential loops.
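For illustration, here is a condensed copy of the first case in the new
mapping.mlir test added below. The mapper annotates the outer loop with
block ids x/y (processors 0 and 1) and the inner loop with thread ids
x/y (processors 3 and 4):

  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
      step (%four, %four) {
    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
        step (%one, %one) {
    }
  }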
Differential Revision: https://reviews.llvm.org/D74963
Added:
mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
mlir/test/Dialect/GPU/mapping.mlir
mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
Modified:
mlir/include/mlir/Dialect/LoopOps/LoopOps.td
mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
mlir/lib/Dialect/GPU/CMakeLists.txt
mlir/test/lib/Transforms/CMakeLists.txt
mlir/tools/mlir-opt/mlir-opt.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
new file mode 100644
index 000000000000..34faf43ea1d6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -0,0 +1,50 @@
+//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utilities to generate mappings for parallel
+// loops to GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+
+namespace mlir {
+
+class Region;
+
+namespace gpu {
+
+/// Name of the mapping attribute produced by loop mappers.
+static constexpr const char *kMappingAttributeName = "mapping";
+/// Name of the processor sub-attribute that identifies the hardware id
+/// to map a loop to.
+static constexpr const char *kProcessorEntryName = "processor";
+/// Name of the map sub-attribute that identifies the affine map to apply
+/// to the hardware id to compute the iteration number of the loop. This
+/// map is expected to be extended by step and lower bound computations:
+/// index = map(hardware_id) * step + lowerbound
+static constexpr const char *kIndexMapEntryName = "map";
+/// Name of the bound sub-attribute that identifies the affine map to
+/// compute an upper bound of iterations for the hardware id. This is
+/// applied to an upper bound on the number of iterations:
+/// launchBound = bound(upperbound-lowerbound ceildiv step)
+static constexpr const char *kBoundMapEntryName = "bound";
+
+} // end namespace gpu
+
+/// Maps the parallel loops found in the given region to workgroups. The first
+/// loop encountered will be mapped to the global workgroup and the second loop
+/// encountered to the local workgroup. Within each mapping, the first three
+/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
+/// mapped to sequential loops.
+void greedilyMapParallelLoopsToGPU(Region &region);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
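For a concrete instance of the attribute shape these names describe, see
the new mapping.mlir test below: an identity-mapped dimension bound to
thread x prints as

  {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}

and the mapping attribute collects one such dictionary per loop dimension
in an array.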
diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
index aae2cbf6462e..3fa8b16ecde5 100644
--- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
+++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
@@ -289,6 +289,9 @@ def ParallelOp : Loop_Op<"parallel",
let extraClassDeclaration = [{
Block *getBody() { return &region().front(); }
+ unsigned getNumInductionVars() {
+ return getBody()->getNumArguments();
+ }
iterator_range<Block::args_iterator> getInductionVars() {
return {getBody()->args_begin(), getBody()->args_end()};
}
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index 41a8fe068735..f28409f23045 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -17,6 +17,7 @@
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"
@@ -508,23 +509,20 @@ struct MappingAnnotation {
} // namespace
-static constexpr const char *kProcessorEntryName = "processor";
-static constexpr const char *kIndexMapEntryName = "map";
-static constexpr const char *kBoundMapEntryName = "bound";
-
/// Extracts the mapping annotations from the provided attribute. The attribute
/// is expected to be of the form
/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
/// where the bound is optional.
static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
DictionaryAttr dict = attribute.cast<DictionaryAttr>();
- unsigned processor = dict.get(kProcessorEntryName)
+ unsigned processor = dict.get(gpu::kProcessorEntryName)
.cast<IntegerAttr>()
.getValue()
.getSExtValue();
- AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
+ AffineMap map =
+ dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
AffineMapAttr boundAttr =
- dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
+ dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
AffineMap bound;
if (boundAttr)
bound = boundAttr.getValue();
@@ -583,7 +581,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
PatternRewriter &rewriter) {
// TODO(herhut): Verify that this is a valid GPU mapping.
// processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
- ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
+ ArrayAttr mapping =
+ parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
// TODO(herhut): Support reductions.
if (!mapping || parallelOp.getNumResults() != 0)
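A full annotation, as produced by the greedy mapper and checked in the new
mapping.mlir test, carries one dictionary per loop dimension, e.g. for a
2-D loop mapped to threads:

  mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
             {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]

Because the bound entry is looked up with dyn_cast_or_null above, a
dictionary that omits the optional bound is also accepted.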
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 17342f85f9d2..dec7a8bd6867 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(MLIRGPU
Transforms/AllReduceLowering.cpp
Transforms/KernelOutlining.cpp
Transforms/MemoryPromotion.cpp
+ Transforms/ParallelLoopMapper.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
new file mode 100644
index 000000000000..f85a0c702729
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -0,0 +1,89 @@
+//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities to generate mappings for parallel loops to
+// GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+using namespace mlir::loop;
+
+namespace {
+
+enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+
+static constexpr int kNumHardwareIds = 3;
+
+} // namespace
+
+/// Bounded increment on MappingLevel. Increments to the next
+/// level unless Sequential was already reached.
+MappingLevel &operator++(MappingLevel &mappingLevel) {
+ if (mappingLevel < Sequential) {
+ mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
+ }
+ return mappingLevel;
+}
+
+/// Computes the hardware id to use for a given mapping level. Will
+/// assign x, y and z hardware ids for the first 3 dimensions and use
+/// sequential after.
+static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
+ if (dimension >= kNumHardwareIds || level == Sequential)
+ return Sequential * kNumHardwareIds;
+ return (level * kNumHardwareIds) + dimension;
+}
+
+/// Add mapping information to the given parallel loop. Do not add
+/// mapping information if the loop already has it. Also, don't
+/// start a mapping at a nested loop.
+static void mapParallelOp(ParallelOp parallelOp,
+ MappingLevel mappingLevel = MapGrid) {
+ // Do not try to add a mapping to already mapped loops or nested loops.
+ if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
+ ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
+ return;
+
+ MLIRContext *ctx = parallelOp.getContext();
+ Builder b(ctx);
+ SmallVector<Attribute, 4> attrs;
+ attrs.reserve(parallelOp.getNumInductionVars());
+ for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
+ SmallVector<NamedAttribute, 3> entries;
+ entries.emplace_back(b.getNamedAttr(
+ kProcessorEntryName,
+ b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
+ entries.emplace_back(b.getNamedAttr(
+ kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
+ entries.emplace_back(b.getNamedAttr(
+ kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
+ attrs.push_back(DictionaryAttr::get(entries, ctx));
+ }
+ parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
+ ++mappingLevel;
+ // Parallel loop operations are immediately nested, so there is no need
+ // for a walk; just iterate over the body's operations.
+ for (Operation &op : *parallelOp.getBody()) {
+ if (ParallelOp nested = dyn_cast<ParallelOp>(op))
+ mapParallelOp(nested, mappingLevel);
+ }
+}
+
+void mlir::greedilyMapParallelLoopsToGPU(Region &region) {
+ region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+}
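As a worked example of getHardwareIdForMapping: ids are computed as
level * 3 + dimension, so MapGrid (0) assigns 0, 1 and 2 (block x/y/z)
to the first three dimensions, MapBlock (1) assigns 3, 4 and 5 (thread
x/y/z), and any fourth or later dimension, or any loop reached at the
Sequential level, gets 2 * 3 = 6. This matches the processor values
checked in the 4-D case of the new mapping.mlir test:

  outermost loop (grid level):   processor = 0, 1, 2, 6
  second loop (block level):     processor = 3, 4, 5, 6
  innermost loop (sequential):   processor = 6, 6, 6, 6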
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
new file mode 100644
index 000000000000..6721738a1048
--- /dev/null
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+ %arg3 : index) {
+ %zero = constant 0 : index
+ %one = constant 1 : index
+ %four = constant 4 : index
+ loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+ step (%four, %four) {
+ loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+ step (%one, %one) {
+ }
+ }
+ return
+}
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
+// CHECK-NOT: mapping
+
+// -----
+
+func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
+ %arg3 : index) {
+ %zero = constant 0 : index
+ %one = constant 1 : index
+ %four = constant 4 : index
+ loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3)
+ step (%four, %four, %four, %four) {
+ loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+ step (%one, %one, %one, %one) {
+ loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+ step (%one, %one, %one, %one) {
+ }
+ }
+ }
+ return
+}
+
+// CHECK-LABEL: func @parallel_loop_4d(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK-NOT: mapping
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 8c422e718f1f..91672ea129c0 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(MLIRTestTransforms
TestConstantFold.cpp
TestLoopFusion.cpp
TestGpuMemoryPromotion.cpp
+ TestGpuParallelLoopMapping.cpp
TestInlining.cpp
TestLinalgTransforms.cpp
TestLiveness.cpp
diff --git a/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
new file mode 100644
index 000000000000..df96d9ce96ea
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
@@ -0,0 +1,38 @@
+//===- TestGpuParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for mapping parallel
+// loops to gpu hardware ids.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the mapping of parallel loops to hardware ids using
+/// a greedy mapping strategy.
+class TestGpuGreedyParallelLoopMappingPass
+ : public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
+ void runOnOperation() override {
+ Operation *op = getOperation();
+ for (Region &region : op->getRegions())
+ greedilyMapParallelLoopsToGPU(region);
+ }
+};
+} // end namespace
+
+namespace mlir {
+void registerTestGpuParallelLoopMappingPass() {
+ PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
+ "test-gpu-greedy-parallel-loop-mapping",
+ "Greedily maps all parallel loops to gpu hardware ids.");
+}
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 291e31bfa6fe..dd957a59aa30 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -50,6 +50,7 @@ void registerTestMemRefDependenceCheck();
void registerTestMemRefStrideCalculation();
void registerTestOpaqueLoc();
void registerTestParallelismDetection();
+void registerTestGpuParallelLoopMappingPass();
void registerTestVectorConversions();
void registerTestVectorToLoopsPass();
void registerVectorizerTestPass();
@@ -103,6 +104,7 @@ void registerTestPasses() {
registerTestMemRefStrideCalculation();
registerTestOpaqueLoc();
registerTestParallelismDetection();
+ registerTestGpuParallelLoopMappingPass();
registerTestVectorConversions();
registerTestVectorToLoopsPass();
registerVectorizerTestPass();