[Mlir-commits] [mlir] 715783d - [MLIR][GPU] Implement initial mapping from loop.parallel to gpu.launch.

Stephan Herhut llvmlistbot at llvm.org
Thu Feb 13 07:54:30 PST 2020


Author: Stephan Herhut
Date: 2020-02-13T16:54:16+01:00
New Revision: 715783d415fe60be0230cbdf53ffae5adaa87950

URL: https://github.com/llvm/llvm-project/commit/715783d415fe60be0230cbdf53ffae5adaa87950
DIFF: https://github.com/llvm/llvm-project/commit/715783d415fe60be0230cbdf53ffae5adaa87950.diff

LOG: [MLIR][GPU] Implement initial mapping from loop.parallel to gpu.launch.

Summary:
To unblock other work, this implements a basic lowering based on mapping
attributes that have to be provided on all loop.parallel operations. The
lowering does not yet support reductions.
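
For illustration, a mapped 2-d loop.parallel of the form this lowering expects
looks roughly as follows (a sketch adapted from the tests added in this change;
%lb*, %ub* and %s* are placeholder SSA names):

  loop.parallel (%i0, %i1) = (%lb0, %lb1) to (%ub0, %ub1)
                                          step (%s0, %s1) {
    // loop body
  } { mapping = [
      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
      {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
    ] }

Processor ids 0-2 select block x/y/z, 3-5 select thread x/y/z, and 6 requests a
sequential loop.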

Differential Revision: https://reviews.llvm.org/D73893

Added: 
    mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir

Modified: 
    mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
    mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
index 79e58fec3db9..0bf49deb0762 100644
--- a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
+++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h
@@ -12,6 +12,8 @@
 
 namespace mlir {
 class AffineForOp;
+class MLIRContext;
+class OwningRewritePatternList;
 struct LogicalResult;
 class Value;
 
@@ -72,6 +74,11 @@ LogicalResult convertLoopToGPULaunch(loop::ForOp forOp,
                                      ArrayRef<Value> numWorkGroups,
                                      ArrayRef<Value> workGroupSizes);
 
+/// Adds the conversion pattern from `loop.parallel` to `gpu.launch` to the
+/// provided pattern list.
+void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
+                                       MLIRContext *ctx);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPU_H_

diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index 9c6c8d187b59..70935f6da70b 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -20,8 +20,12 @@
 #include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Support/Debug.h"
@@ -487,3 +491,327 @@ LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp,
                                            ArrayRef<Value> workGroupSizes) {
   return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
 }
+
+namespace {
+struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
+  using OpRewritePattern<ParallelOp>::OpRewritePattern;
+
+  PatternMatchResult matchAndRewrite(ParallelOp parallelOp,
+                                     PatternRewriter &rewriter) const override;
+};
+
+struct MappingAnnotation {
+  unsigned processor;
+  AffineMap indexMap;
+  AffineMap boundMap;
+};
+
+} // namespace
+
+static constexpr const char *kProcessorEntryName = "processor";
+static constexpr const char *kIndexMapEntryName = "map";
+static constexpr const char *kBoundMapEntryName = "bound";
+
+/// Extracts the mapping annotations from the provided attribute. The attribute
+/// is expected to be of the form
+/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
+/// where the bound is optional.
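+/// For example, a dimension mapped to block.y with identity maps would carry
+/// (attribute syntax as exercised by the tests added in this change):
+///   {processor = 1, map = affine_map<(d0) -> (d0)>,
+///    bound = affine_map<(d0) -> (d0)>}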
+static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
+  DictionaryAttr dict = attribute.cast<DictionaryAttr>();
+  unsigned processor = dict.get(kProcessorEntryName)
+                           .cast<IntegerAttr>()
+                           .getValue()
+                           .getSExtValue();
+  AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
+  AffineMapAttr boundAttr =
+      dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
+  AffineMap bound;
+  if (boundAttr)
+    bound = boundAttr.getValue();
+  return {processor, map, bound};
+}
+
+/// Tries to derive a static upper bound from the defining operation of
+/// `upperBound`.
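+/// For example, if `upperBound` is defined by
+///   %ub = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%c2, %n, %i)
+/// (illustrative SSA names, patterned after the tests added in this change)
+/// and %c2 is produced by a ConstantOp, then %c2 is returned. Otherwise, an
+/// empty Value is returned.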
+static Value deriveStaticUpperBound(Value upperBound) {
+  Value constantBound = {};
+  if (AffineMinOp minOp =
+          dyn_cast_or_null<AffineMinOp>(upperBound.getDefiningOp())) {
+    auto map = minOp.map();
+    auto operands = minOp.operands();
+    for (int sub = 0, e = map.getNumResults(); sub < e; ++sub) {
+      AffineExpr expr = map.getResult(sub);
+      if (AffineDimExpr dimExpr = expr.dyn_cast<AffineDimExpr>()) {
+        auto dimOperand = operands[dimExpr.getPosition()];
+        auto defOp = dimOperand.getDefiningOp();
+        if (ConstantOp constOp = dyn_cast_or_null<ConstantOp>(defOp)) {
+          constantBound = constOp;
+          break;
+        }
+      }
+    }
+  }
+  return constantBound;
+}
+
+/// Modifies the current transformation state to capture the effect of the given
+/// `loop.parallel` operation on index substitutions and the operations to be
+/// inserted.
+/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
+/// this function will
+/// - compute the loop index based on the hardware id and affine map from the
+///   mapping and update `cloningMap` to substitute all uses.
+/// - derive a new upper bound for the hardware id and augment the provided
+///   `gpu.launch` operation accordingly.
+/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
+///   and update the rewriter to insert into the conditional's body.
+/// If the dimension is mapped to sequential,
+/// - insert a for loop into the body and update the rewriter to insert into
+///   the for loop's body.
+/// - update the `cloningMap` to replace uses of the index with the index of
+///   the new for loop.
+/// In either case,
+/// - append the operations from the loop's body to the worklist, in reverse
+///   order.
+/// To note the end of the current scope in case a loop or conditional was
+/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
+/// worklist. This signals the processor of the worklist to pop the rewriter
+/// one scope-level up.
+static LogicalResult processParallelLoop(ParallelOp parallelOp,
+                                         gpu::LaunchOp launchOp,
+                                         BlockAndValueMapping &cloningMap,
+                                         SmallVectorImpl<Operation *> &worklist,
+                                         PatternRewriter &rewriter) {
+  // TODO(herhut): Verify that this is a valid GPU mapping.
+  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
+  ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
+
+  // TODO(herhut): Support reductions.
+  if (!mapping || parallelOp.getNumResults() != 0)
+    return failure();
+
+  Location loc = parallelOp.getLoc();
+
+  auto launchIndependent = [&launchOp](Value val) {
+    return val.getParentRegion()->isAncestor(launchOp.getParentRegion());
+  };
+
+  auto ensureLaunchIndependent = [&launchOp, &rewriter,
+                                  launchIndependent](Value val) -> Value {
+    if (launchIndependent(val))
+      return val;
+    if (ConstantOp constOp = dyn_cast_or_null<ConstantOp>(val.getDefiningOp()))
+      return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue());
+    return {};
+  };
+
+  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
+                               parallelOp.lowerBound(), parallelOp.upperBound(),
+                               parallelOp.step())) {
+    Attribute mappingAttribute;
+    Value iv, lowerBound, upperBound, step;
+    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
+    MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute);
+    Value newIndex;
+
+    if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {
+      // Use the corresponding thread/grid index as a replacement for the
+      // loop iv.
+      // TODO(herhut): Make the iv calculation depend on lower & upper bound.
+      Value operand = launchOp.body().front().getArgument(annotation.processor);
+      Value appliedMap =
+          rewriter.create<AffineApplyOp>(loc, annotation.indexMap, operand);
+      // Add the lower bound, as the maps are 0 based but the loop might not be.
+      // TODO(herhut): Maybe move this explicitly into the maps?
+      newIndex = rewriter.create<AddIOp>(
+          loc, appliedMap, cloningMap.lookupOrDefault(lowerBound));
+      // If there was also a bound, insert that, too.
+      // TODO(herhut): Check that we do not assign bounds twice.
+      if (annotation.boundMap) {
+        // We pass the number of iterations, i.e. upperBound - lowerBound, as
+        // the single operand to the bound map. To support inner loops with
+        // dynamic upper bounds (as generated by e.g. tiling), try to derive a
+        // static maximum for the bound. If the bound used for the hardware id
+        // is imprecise, wrap the contained code into a conditional.
+        // If the lower-bound is constant or defined before the launch, we can
+        // use it in the launch bounds. Otherwise fail.
+        if (!launchIndependent(lowerBound) &&
+            !isa<ConstantOp>(lowerBound.getDefiningOp()))
+          return failure();
+        // If the upper-bound is constant or defined before the launch, we can
+        // use it in the launch bounds directly. Otherwise, try to derive a bound.
+        bool boundIsPrecise = launchIndependent(upperBound) ||
+                              isa<ConstantOp>(upperBound.getDefiningOp());
+        if (!boundIsPrecise) {
+          upperBound = deriveStaticUpperBound(upperBound);
+          if (!upperBound)
+            return failure();
+        }
+        {
+          PatternRewriter::InsertionGuard guard(rewriter);
+          rewriter.setInsertionPoint(launchOp);
+
+          Value iterations = rewriter.create<SubIOp>(
+              loc,
+              ensureLaunchIndependent(cloningMap.lookupOrDefault(upperBound)),
+              ensureLaunchIndependent(cloningMap.lookupOrDefault(lowerBound)));
+          Value launchBound = rewriter.create<AffineApplyOp>(
+              loc, annotation.boundMap, iterations);
+          launchOp.setOperand(annotation.processor, launchBound);
+        }
+        if (!boundIsPrecise) {
+          // We are using an approximation, so create a surrounding conditional.
+          Value originalBound = std::get<3>(config);
+          CmpIOp pred = rewriter.create<CmpIOp>(
+              loc, CmpIPredicate::slt, newIndex,
+              cloningMap.lookupOrDefault(originalBound));
+          loop::IfOp ifOp = rewriter.create<loop::IfOp>(loc, pred, false);
+          rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
+          // Put a sentinel into the worklist so we know when to pop out of the
+          // if body again. We use the launchOp here, as that cannot be part of
+          // the body's operations.
+          worklist.push_back(launchOp.getOperation());
+        }
+      }
+    } else {
+      // Create a sequential for loop.
+      auto loopOp = rewriter.create<loop::ForOp>(
+          loc, cloningMap.lookupOrDefault(lowerBound),
+          cloningMap.lookupOrDefault(upperBound),
+          cloningMap.lookupOrDefault(step));
+      newIndex = loopOp.getInductionVar();
+      rewriter.setInsertionPointToStart(loopOp.getBody());
+      // Put a sentinel into the worklist so we know when to pop out of the
+      // loop body again. We use the launchOp here, as that cannot be part of
+      // the body's operations.
+      worklist.push_back(launchOp.getOperation());
+    }
+    cloningMap.map(iv, newIndex);
+  }
+  Block *body = parallelOp.getBody();
+  worklist.reserve(worklist.size() + body->getOperations().size());
+  for (Operation &op : llvm::reverse(body->without_terminator()))
+    worklist.push_back(&op);
+  return success();
+}
+
+/// Lower a `loop.parallel` operation into a corresponding `gpu.launch`
+/// operation.
+///
+/// This essentially transforms a loop nest into a corresponding SIMT function.
+/// The conversion is driven by mapping annotations on the `loop.parallel`
+/// operations. The mapping is provided via an array attribute named
+/// `mapping`, with one dictionary per loop dimension; each dictionary has
+/// three entries:
+///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
+///               thread dimensions and 6 is sequential.
+///  - map : An affine map that is used to pre-process hardware ids before
+///          substitution.
+///  - bound : An affine map that is used to compute the bound of the hardware
+///            id based on an upper bound of the number of iterations.
+/// If the `loop.parallel` contains nested `loop.parallel` operations, those
+/// need to be annotated, as well. Structurally, the transformation works by
+/// splicing all operations from nested `loop.parallel` operations into a single
+/// sequence. Indices mapped to hardware ids are substituted with those ids,
+/// whereas sequential mappings result in a sequential for-loop. To have more
+/// flexibility when mapping code to hardware ids, the transform supports two
+/// affine maps. The first `map` is used to compute the actual index for
+/// substitution from the hardware id. The second `bound` is used to compute the
+/// launch dimension for the hardware id from the number of iterations the
+/// mapped loop is performing. Note that the number of iterations might be
+/// imprecise if the corresponding loop bounds are loop-dependent. In such cases,
+/// the hardware id might iterate over additional indices. The transformation
+/// caters for this by predicating the created sequence of instructions on
+/// the actual loop bound. This only works if a static upper bound for the
+/// dynamic loop bound can be derived, currently by analyzing `affine.min`
+/// operations.
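+///
+/// As a rough sketch of the generated structure (see the tests added in this
+/// change for complete examples; SSA names are illustrative), a dimension
+/// mapped to a hardware id with an imprecise upper bound becomes:
+///   %idx = affine.apply #map(%hardware_id)
+///   %i = addi %idx, %lower_bound : index
+///   %in_bounds = cmpi "slt", %i, %original_upper_bound : index
+///   loop.if %in_bounds {
+///     // cloned loop body
+///   }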
+PatternMatchResult
+ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
+                                             PatternRewriter &rewriter) const {
+  // Create a launch operation. We start with bound one for all grid/block
+  // sizes. Those will be refined later as we discover them from mappings.
+  Location loc = parallelOp.getLoc();
+  Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1);
+  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
+      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
+      constantOne, constantOne);
+  rewriter.setInsertionPointToEnd(&launchOp.body().front());
+  rewriter.create<gpu::TerminatorOp>(loc);
+  rewriter.setInsertionPointToStart(&launchOp.body().front());
+
+  BlockAndValueMapping cloningMap;
+  SmallVector<Operation *, 16> worklist;
+  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
+                                 rewriter)))
+    return matchFailure();
+
+  // Whether we have seen any side-effects. Reset when leaving an inner scope.
+  bool seenSideeffects = false;
+  // Whether we have left a nesting scope (and hence are no longer innermost).
+  bool leftNestingScope = false;
+  while (!worklist.empty()) {
+    Operation *op = worklist.pop_back_val();
+
+    // Now walk over the body and clone it.
+    // TODO: This is only correct if there either is no further loop.parallel
+    //       nested or this code is side-effect free. Otherwise we might need
+    //       predication. We are overly conservative for now and only allow
+    //       side effects in the innermost scope.
+    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
+      // Before entering a nested scope, make sure there have been no
+      // side effects until now.
+      if (seenSideeffects)
+        return matchFailure();
+      // A nested loop.parallel needs insertion of code to compute indices.
+      // Insert that now. This will also update the worklist with the loop's
+      // body.
+      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
+                                     worklist, rewriter)))
+        return matchFailure();
+    } else if (op == launchOp.getOperation()) {
+      // Found our sentinel value. We have finished the operations from one
+      // nesting level, pop one level back up.
+      auto parent = rewriter.getInsertionPoint()->getParentOp();
+      rewriter.setInsertionPointAfter(parent);
+      leftNestingScope = true;
+      seenSideeffects = false;
+    } else {
+      // Otherwise we copy it over.
+      Operation *clone = rewriter.clone(*op, cloningMap);
+      cloningMap.map(op->getResults(), clone->getResults());
+      // Check for side effects.
+      seenSideeffects |= !clone->hasNoSideEffect();
+      // If we are no longer in the innermost scope, side effects are disallowed.
+      if (seenSideeffects && leftNestingScope)
+        return matchFailure();
+    }
+  }
+
+  rewriter.eraseOp(parallelOp);
+  return matchSuccess();
+}
+
+namespace {
+struct ParallelLoopToGpuPass : public OperationPass<ParallelLoopToGpuPass> {
+  void runOnOperation() override;
+};
+} // namespace
+
+void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
+                                             MLIRContext *ctx) {
+  patterns.insert<ParallelToGpuLaunchLowering>(ctx);
+}
+
+void ParallelLoopToGpuPass::runOnOperation() {
+  OwningRewritePatternList patterns;
+  populateParallelLoopToGPUPatterns(patterns, &getContext());
+  ConversionTarget target(getContext());
+  target.addLegalDialect<StandardOpsDialect>();
+  target.addLegalDialect<AffineOpsDialect>();
+  target.addLegalDialect<gpu::GPUDialect>();
+  target.addLegalDialect<loop::LoopOpsDialect>();
+  target.addIllegalOp<loop::ParallelOp>();
+  if (failed(applyPartialConversion(getOperation(), target, patterns)))
+    signalPassFailure();
+}
+
+static PassRegistration<ParallelLoopToGpuPass>
+    pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops"
+                                          " to gpu launch operations.");
\ No newline at end of file

diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
new file mode 100644
index 000000000000..eebb602fba59
--- /dev/null
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -0,0 +1,326 @@
+// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
+
+// 2-d parallel loop mapped to block.y and block.x
+
+func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
+                              %arg3 : index, %arg4 : index, 
+                              %buf : memref<?x?xf32>,
+                              %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%arg4, %step)  {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32>
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }
+  return
+}
+
+// CHECK:       #map0 = affine_map<(d0) -> (d0)>
+// CHECK:       module {
+
+// CHECK-LABEL:   func @parallel_loop_bidy_bidx(
+// CHECK-SAME:                        [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_7:%.*]] = constant 2 : index
+// CHECK:           [[VAL_8:%.*]] = constant 1 : index
+// CHECK:           [[VAL_9:%.*]] = subi [[VAL_2]], [[VAL_0]] : index
+// CHECK:           [[VAL_10:%.*]] = affine.apply #map0([[VAL_9]])
+// CHECK:           [[VAL_11:%.*]] = subi [[VAL_3]], [[VAL_1]] : index
+// CHECK:           [[VAL_12:%.*]] = affine.apply #map0([[VAL_11]])
+// CHECK:           gpu.launch blocks([[VAL_13:%.*]], [[VAL_14:%.*]], [[VAL_15:%.*]]) in ([[VAL_16:%.*]] = [[VAL_12]], [[VAL_17:%.*]] = [[VAL_10]], [[VAL_18:%.*]] = [[VAL_8]]) threads([[VAL_19:%.*]], [[VAL_20:%.*]], [[VAL_21:%.*]]) in ([[VAL_22:%.*]] = [[VAL_8]], [[VAL_23:%.*]] = [[VAL_8]], [[VAL_24:%.*]] = [[VAL_8]]) {
+// CHECK:             [[VAL_25:%.*]] = affine.apply #map0([[VAL_14]])
+// CHECK:             [[VAL_26:%.*]] = addi [[VAL_25]], [[VAL_0]] : index
+// CHECK:             [[VAL_27:%.*]] = affine.apply #map0([[VAL_13]])
+// CHECK:             [[VAL_28:%.*]] = addi [[VAL_27]], [[VAL_1]] : index
+// CHECK:             [[VAL_29:%.*]] = load [[VAL_5]]{{\[}}[[VAL_26]], [[VAL_28]]] : memref<?x?xf32>
+// CHECK:             store [[VAL_29]], [[VAL_6]]{{\[}}[[VAL_28]], [[VAL_26]]] : memref<?x?xf32>
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
+
+func @parallel_loop_tiled(%arg0 : index, %arg1 : index, %arg2 : index,
+                        %arg3 : index,
+                        %buf : memref<?x?xf32>,
+                        %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 3, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+     ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK:       #map0 = affine_map<(d0) -> (d0)>
+// CHECK:       module {
+
+// CHECK-LABEL:   func @parallel_loop_tiled(
+// CHECK-SAME:                              [[VAL_30:%.*]]: index, [[VAL_31:%.*]]: index, [[VAL_32:%.*]]: index, [[VAL_33:%.*]]: index, [[VAL_34:%.*]]: memref<?x?xf32>, [[VAL_35:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_36:%.*]] = constant 0 : index
+// CHECK:           [[VAL_37:%.*]] = constant 1 : index
+// CHECK:           [[VAL_38:%.*]] = constant 4 : index
+// CHECK:           [[VAL_39:%.*]] = constant 1 : index
+// CHECK:           [[VAL_40:%.*]] = subi [[VAL_32]], [[VAL_30]] : index
+// CHECK:           [[VAL_41:%.*]] = affine.apply #map0([[VAL_40]])
+// CHECK:           [[VAL_42:%.*]] = subi [[VAL_33]], [[VAL_31]] : index
+// CHECK:           [[VAL_43:%.*]] = affine.apply #map0([[VAL_42]])
+// CHECK:           [[VAL_44:%.*]] = subi [[VAL_38]], [[VAL_36]] : index
+// CHECK:           [[VAL_45:%.*]] = affine.apply #map0([[VAL_44]])
+// CHECK:           [[VAL_46:%.*]] = subi [[VAL_38]], [[VAL_36]] : index
+// CHECK:           [[VAL_47:%.*]] = affine.apply #map0([[VAL_46]])
+// CHECK:           gpu.launch blocks([[VAL_48:%.*]], [[VAL_49:%.*]], [[VAL_50:%.*]]) in ([[VAL_51:%.*]] = [[VAL_43]], [[VAL_52:%.*]] = [[VAL_41]], [[VAL_53:%.*]] = [[VAL_39]]) threads([[VAL_54:%.*]], [[VAL_55:%.*]], [[VAL_56:%.*]]) in ([[VAL_57:%.*]] = [[VAL_47]], [[VAL_58:%.*]] = [[VAL_45]], [[VAL_59:%.*]] = [[VAL_39]]) {
+// CHECK:             [[VAL_60:%.*]] = affine.apply #map0([[VAL_49]])
+// CHECK:             [[VAL_61:%.*]] = addi [[VAL_60]], [[VAL_30]] : index
+// CHECK:             [[VAL_62:%.*]] = affine.apply #map0([[VAL_48]])
+// CHECK:             [[VAL_63:%.*]] = addi [[VAL_62]], [[VAL_31]] : index
+// CHECK:             [[VAL_64:%.*]] = affine.apply #map0([[VAL_55]])
+// CHECK:             [[VAL_65:%.*]] = addi [[VAL_64]], [[VAL_36]] : index
+// CHECK:             [[VAL_66:%.*]] = affine.apply #map0([[VAL_54]])
+// CHECK:             [[VAL_67:%.*]] = addi [[VAL_66]], [[VAL_36]] : index
+// CHECK:             [[VAL_68:%.*]] = addi [[VAL_61]], [[VAL_65]] : index
+// CHECK:             [[VAL_69:%.*]] = addi [[VAL_63]], [[VAL_67]] : index
+// CHECK:             [[VAL_70:%.*]] = load [[VAL_34]]{{\[}}[[VAL_68]], [[VAL_69]]] : memref<?x?xf32>
+// CHECK:             store [[VAL_70]], [[VAL_35]]{{\[}}[[VAL_69]], [[VAL_68]]] : memref<?x?xf32>
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+
+// -----
+
+// 2-d parallel loop mapped to block.y and sequential
+
+func @parallel_loop_bidy_seq(%arg0 : index, %arg1 : index, %arg2 : index,
+                             %arg3 : index, %arg4 : index,
+                             %buf : memref<?x?xf32>,
+                             %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%arg4, %step)  {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32>
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK:       #map0 = affine_map<(d0) -> (d0)>
+// CHECK:       module {
+
+// CHECK-LABEL:   func @parallel_loop_bidy_seq(
+// CHECK-SAME:                        [[VAL_71:%.*]]: index, [[VAL_72:%.*]]: index, [[VAL_73:%.*]]: index, [[VAL_74:%.*]]: index, [[VAL_75:%.*]]: index, [[VAL_76:%.*]]: memref<?x?xf32>, [[VAL_77:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_78:%.*]] = constant 2 : index
+// CHECK:           [[VAL_79:%.*]] = constant 1 : index
+// CHECK:           [[VAL_80:%.*]] = subi [[VAL_73]], [[VAL_71]] : index
+// CHECK:           [[VAL_81:%.*]] = affine.apply #map0([[VAL_80]])
+// CHECK:           gpu.launch blocks([[VAL_82:%.*]], [[VAL_83:%.*]], [[VAL_84:%.*]]) in ([[VAL_85:%.*]] = [[VAL_79]], [[VAL_86:%.*]] = [[VAL_81]], [[VAL_87:%.*]] = [[VAL_79]]) threads([[VAL_88:%.*]], [[VAL_89:%.*]], [[VAL_90:%.*]]) in ([[VAL_91:%.*]] = [[VAL_79]], [[VAL_92:%.*]] = [[VAL_79]], [[VAL_93:%.*]] = [[VAL_79]]) {
+// CHECK:             [[VAL_94:%.*]] = affine.apply #map0([[VAL_83]])
+// CHECK:             [[VAL_95:%.*]] = addi [[VAL_94]], [[VAL_71]] : index
+// CHECK:             loop.for [[VAL_96:%.*]] = [[VAL_72]] to [[VAL_74]] step [[VAL_78]] {
+// CHECK:               [[VAL_97:%.*]] = load [[VAL_76]]{{\[}}[[VAL_95]], [[VAL_96]]] : memref<?x?xf32>
+// CHECK:               store [[VAL_97]], [[VAL_77]]{{\[}}[[VAL_96]], [[VAL_95]]] : memref<?x?xf32>
+// CHECK:             }
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
+
+func @parallel_loop_tiled_seq(%arg0 : index, %arg1 : index, %arg2 : index,
+                              %arg3 : index,
+                              %buf : memref<?x?xf32>,
+                              %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK:       #map0 = affine_map<(d0) -> (d0)>
+// CHECK:       module {
+
+// CHECK-LABEL:   func @parallel_loop_tiled_seq(
+// CHECK-SAME:                        [[VAL_98:%.*]]: index, [[VAL_99:%.*]]: index, [[VAL_100:%.*]]: index, [[VAL_101:%.*]]: index, [[VAL_102:%.*]]: memref<?x?xf32>, [[VAL_103:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_104:%.*]] = constant 0 : index
+// CHECK:           [[VAL_105:%.*]] = constant 1 : index
+// CHECK:           [[VAL_106:%.*]] = constant 4 : index
+// CHECK:           [[VAL_107:%.*]] = constant 1 : index
+// CHECK:           [[VAL_108:%.*]] = subi [[VAL_100]], [[VAL_98]] : index
+// CHECK:           [[VAL_109:%.*]] = affine.apply #map0([[VAL_108]])
+// CHECK:           [[VAL_110:%.*]] = subi [[VAL_106]], [[VAL_104]] : index
+// CHECK:           [[VAL_111:%.*]] = affine.apply #map0([[VAL_110]])
+// CHECK:           gpu.launch blocks([[VAL_112:%.*]], [[VAL_113:%.*]], [[VAL_114:%.*]]) in ([[VAL_115:%.*]] = [[VAL_107]], [[VAL_116:%.*]] = [[VAL_109]], [[VAL_117:%.*]] = [[VAL_107]]) threads([[VAL_118:%.*]], [[VAL_119:%.*]], [[VAL_120:%.*]]) in ([[VAL_121:%.*]] = [[VAL_107]], [[VAL_122:%.*]] = [[VAL_111]], [[VAL_123:%.*]] = [[VAL_107]]) {
+// CHECK:             [[VAL_124:%.*]] = affine.apply #map0([[VAL_113]])
+// CHECK:             [[VAL_125:%.*]] = addi [[VAL_124]], [[VAL_98]] : index
+// CHECK:             loop.for [[VAL_126:%.*]] = [[VAL_99]] to [[VAL_101]] step [[VAL_106]] {
+// CHECK:               [[VAL_127:%.*]] = affine.apply #map0([[VAL_119]])
+// CHECK:               [[VAL_128:%.*]] = addi [[VAL_127]], [[VAL_104]] : index
+// CHECK:               loop.for [[VAL_129:%.*]] = [[VAL_104]] to [[VAL_106]] step [[VAL_105]] {
+// CHECK:                 [[VAL_130:%.*]] = addi [[VAL_125]], [[VAL_128]] : index
+// CHECK:                 [[VAL_131:%.*]] = addi [[VAL_126]], [[VAL_129]] : index
+// CHECK:                 [[VAL_132:%.*]] = load [[VAL_102]]{{\[}}[[VAL_130]], [[VAL_131]]] : memref<?x?xf32>
+// CHECK:                 store [[VAL_132]], [[VAL_103]]{{\[}}[[VAL_131]], [[VAL_130]]] : memref<?x?xf32>
+// CHECK:               }
+// CHECK:             }
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+
+// -----
+
+#map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
+#map2 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+#map3 = affine_map<(d0) -> (d0)>
+
+module {
+  func @sum(%arg0: memref<?x?xf32, #map0>, %arg1: memref<?x?xf32, #map0>, %arg2: memref<?x?xf32, #map0>) {
+    %c1 = constant 1 : index
+    %c0 = constant 0 : index
+    %c3 = constant 3 : index
+    %c2 = constant 2 : index
+    %0 = dim %arg0, 0 : memref<?x?xf32, #map0>
+    %1 = dim %arg0, 1 : memref<?x?xf32, #map0>
+    loop.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) {
+      %2 = dim %arg0, 0 : memref<?x?xf32, #map0>
+      %3 = affine.min #map1(%c2, %2, %arg3)
+      %4 = dim %arg0, 1 : memref<?x?xf32, #map0>
+      %5 = affine.min #map1(%c3, %4, %arg4)
+      %6 = std.subview %arg0[%arg3, %arg4][%3, %5][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      %7 = dim %arg1, 0 : memref<?x?xf32, #map0>
+      %8 = affine.min #map1(%c2, %7, %arg3)
+      %9 = dim %arg1, 1 : memref<?x?xf32, #map0>
+      %10 = affine.min #map1(%c3, %9, %arg4)
+      %11 = std.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      %12 = dim %arg2, 0 : memref<?x?xf32, #map0>
+      %13 = affine.min #map1(%c2, %12, %arg3)
+      %14 = dim %arg2, 1 : memref<?x?xf32, #map0>
+      %15 = affine.min #map1(%c3, %14, %arg4)
+      %16 = std.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map2>
+      loop.parallel (%arg5, %arg6) = (%c0, %c0) to (%3, %5) step (%c1, %c1) {
+        %17 = load %6[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %18 = load %11[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %19 = load %16[%arg5, %arg6] : memref<?x?xf32, #map2>
+        %20 = addf %17, %18 : f32
+        store %20, %16[%arg5, %arg6] : memref<?x?xf32, #map2>
+        "loop.terminator"() : () -> ()
+      } { mapping = [
+          {processor = 3, map = #map3, bound = #map3},
+          {processor = 4, map = #map3, bound = #map3}
+        ] }
+      "loop.terminator"() : () -> ()
+    } { mapping = [
+        {processor = 0, map = #map3, bound = #map3},
+        {processor = 1, map = #map3, bound = #map3}
+    ] }
+    return
+  }
+}
+
+// CHECK:       #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+// CHECK:       #map1 = affine_map<(d0) -> (d0)>
+// CHECK:       #map2 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
+// CHECK:       #map3 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+// CHECK:       module {
+
+// CHECK-LABEL:   func @sum(
+// CHECK-SAME:              [[VAL_133:%.*]]: memref<?x?xf32, #map0>, [[VAL_134:%.*]]: memref<?x?xf32, #map0>, [[VAL_135:%.*]]: memref<?x?xf32, #map0>) {
+// CHECK:           [[VAL_136:%.*]] = constant 1 : index
+// CHECK:           [[VAL_137:%.*]] = constant 0 : index
+// CHECK:           [[VAL_138:%.*]] = constant 3 : index
+// CHECK:           [[VAL_139:%.*]] = constant 2 : index
+// CHECK:           [[VAL_140:%.*]] = dim [[VAL_133]], 0 : memref<?x?xf32, #map0>
+// CHECK:           [[VAL_141:%.*]] = dim [[VAL_133]], 1 : memref<?x?xf32, #map0>
+// CHECK:           [[VAL_142:%.*]] = constant 1 : index
+// CHECK:           [[VAL_143:%.*]] = subi [[VAL_140]], [[VAL_137]] : index
+// CHECK:           [[VAL_144:%.*]] = affine.apply #map1([[VAL_143]])
+// CHECK:           [[VAL_145:%.*]] = subi [[VAL_141]], [[VAL_137]] : index
+// CHECK:           [[VAL_146:%.*]] = affine.apply #map1([[VAL_145]])
+// CHECK:           [[VAL_148:%.*]] = subi [[VAL_139]], [[VAL_137]] : index
+// CHECK:           [[VAL_149:%.*]] = affine.apply #map1([[VAL_148]])
+// CHECK:           [[VAL_151:%.*]] = subi [[VAL_138]], [[VAL_137]] : index
+// CHECK:           [[VAL_152:%.*]] = affine.apply #map1([[VAL_151]])
+// CHECK:           gpu.launch blocks([[VAL_153:%.*]], [[VAL_154:%.*]], [[VAL_155:%.*]]) in ([[VAL_156:%.*]] = [[VAL_144]], [[VAL_157:%.*]] = [[VAL_146]], [[VAL_158:%.*]] = [[VAL_142]]) threads([[VAL_159:%.*]], [[VAL_160:%.*]], [[VAL_161:%.*]]) in ([[VAL_162:%.*]] = [[VAL_149]], [[VAL_163:%.*]] = [[VAL_152]], [[VAL_164:%.*]] = [[VAL_142]]) {
+// CHECK:             [[VAL_165:%.*]] = affine.apply #map1([[VAL_153]])
+// CHECK:             [[VAL_166:%.*]] = addi [[VAL_165]], [[VAL_137]] : index
+// CHECK:             [[VAL_167:%.*]] = affine.apply #map1([[VAL_154]])
+// CHECK:             [[VAL_168:%.*]] = addi [[VAL_167]], [[VAL_137]] : index
+// CHECK:             [[VAL_169:%.*]] = dim [[VAL_133]], 0 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_170:%.*]] = affine.min #map2([[VAL_139]], [[VAL_169]], [[VAL_166]])
+// CHECK:             [[VAL_171:%.*]] = dim [[VAL_133]], 1 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_172:%.*]] = affine.min #map2([[VAL_138]], [[VAL_171]], [[VAL_168]])
+// CHECK:             [[VAL_173:%.*]] = std.subview [[VAL_133]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_170]], [[VAL_172]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK:             [[VAL_174:%.*]] = dim [[VAL_134]], 0 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_175:%.*]] = affine.min #map2([[VAL_139]], [[VAL_174]], [[VAL_166]])
+// CHECK:             [[VAL_176:%.*]] = dim [[VAL_134]], 1 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_177:%.*]] = affine.min #map2([[VAL_138]], [[VAL_176]], [[VAL_168]])
+// CHECK:             [[VAL_178:%.*]] = std.subview [[VAL_134]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_175]], [[VAL_177]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK:             [[VAL_179:%.*]] = dim [[VAL_135]], 0 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_180:%.*]] = affine.min #map2([[VAL_139]], [[VAL_179]], [[VAL_166]])
+// CHECK:             [[VAL_181:%.*]] = dim [[VAL_135]], 1 : memref<?x?xf32, #map0>
+// CHECK:             [[VAL_182:%.*]] = affine.min #map2([[VAL_138]], [[VAL_181]], [[VAL_168]])
+// CHECK:             [[VAL_183:%.*]] = std.subview [[VAL_135]]{{\[}}[[VAL_166]], [[VAL_168]]]{{\[}}[[VAL_180]], [[VAL_182]]]{{\[}}[[VAL_136]], [[VAL_136]]] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
+// CHECK:             [[VAL_184:%.*]] = affine.apply #map1([[VAL_159]])
+// CHECK:             [[VAL_185:%.*]] = addi [[VAL_184]], [[VAL_137]] : index
+// CHECK:             [[VAL_186:%.*]] = cmpi "slt", [[VAL_185]], [[VAL_170]] : index
+// CHECK:             loop.if [[VAL_186]] {
+// CHECK:               [[VAL_187:%.*]] = affine.apply #map1([[VAL_160]])
+// CHECK:               [[VAL_188:%.*]] = addi [[VAL_187]], [[VAL_137]] : index
+// CHECK:               [[VAL_189:%.*]] = cmpi "slt", [[VAL_188]], [[VAL_172]] : index
+// CHECK:               loop.if [[VAL_189]] {
+// CHECK:                 [[VAL_190:%.*]] = load [[VAL_173]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK:                 [[VAL_191:%.*]] = load [[VAL_178]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK:                 [[VAL_192:%.*]] = load [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK:                 [[VAL_193:%.*]] = addf [[VAL_190]], [[VAL_191]] : f32
+// CHECK:                 store [[VAL_193]], [[VAL_183]]{{\[}}[[VAL_185]], [[VAL_188]]] : memref<?x?xf32, #map3>
+// CHECK:               }
+// CHECK:             }
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+


        

