[Mlir-commits] [mlir] 46bb661 - [mlir][GPU] Use StructAttr to drive lowering from loop.parallel to
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Mar 24 16:17:57 PDT 2020
Author: MaheshRavishankar
Date: 2020-03-24T16:16:55-07:00
New Revision: 46bb6613a31fd43b6d4485ce7e71a387dc22cbc7
URL: https://github.com/llvm/llvm-project/commit/46bb6613a31fd43b6d4485ce7e71a387dc22cbc7
DIFF: https://github.com/llvm/llvm-project/commit/46bb6613a31fd43b6d4485ce7e71a387dc22cbc7.diff
LOG: [mlir][GPU] Use StructAttr to drive lowering from loop.parallel to
gpu.launch
The current implementation of lowering from loop.parallel to gpu.launch
uses a DictionaryAttr to specify the mapping. This change moves the
attribute to be auto-generated from a specification as a StructAttr,
which greatly simplifies the logic of looking up and creating this attribute.
Differential Revision: https://reviews.llvm.org/D76165
Added:
mlir/include/mlir/Dialect/GPU/GPUBase.td
mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td
Modified:
mlir/include/mlir/Dialect/GPU/CMakeLists.txt
mlir/include/mlir/Dialect/GPU/GPUOps.td
mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
mlir/lib/Dialect/GPU/CMakeLists.txt
mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
index 3122b8421cbb..d341303d62da 100644
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1,2 +1,12 @@
-add_mlir_dialect(GPUOps gpu)
+add_mlir_dialect(GPUOps gpu GPUOps)
add_mlir_doc(GPUOps -gen-dialect-doc GPUDialect Dialects/)
+
+set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
+mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
+mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
+add_public_tablegen_target(MLIRParallelLoopMapperAttrGen)
+
+set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
+mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls)
+mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
new file mode 100644
index 000000000000..39e2f1a940d9
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -0,0 +1,58 @@
+//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the GPU dialect
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GPU_BASE
+#define GPU_BASE
+
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// GPU Dialect.
+//===----------------------------------------------------------------------===//
+
+def GPU_Dialect : Dialect {
+ let name = "gpu";
+ let hasOperationAttrVerify = 1;
+
+ let extraClassDeclaration = [{
+ /// Get the name of the attribute used to annotate the modules that contain
+ /// kernel modules.
+ static StringRef getContainerModuleAttrName() {
+ return "gpu.container_module";
+ }
+ /// Get the name of the attribute used to annotate external kernel
+ /// functions.
+ static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }
+
+ /// Get the name of the attribute used to annotate kernel modules.
+ static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
+
+ /// Returns whether the given function is a kernel function, i.e., has the
+ /// 'gpu.kernel' attribute.
+ static bool isKernel(Operation *op);
+
+ /// Returns the number of workgroup (thread, block) dimensions supported in
+ /// the GPU dialect.
+ // TODO(zinenko,herhut): consider generalizing this.
+ static unsigned getNumWorkgroupDimensions() { return 3; }
+
+ /// Returns the numeric value used to identify the workgroup memory address
+ /// space.
+ static unsigned getWorkgroupAddressSpace() { return 3; }
+
+ /// Returns the numeric value used to identify the private memory address
+ /// space.
+ static unsigned getPrivateAddressSpace() { return 5; }
+ }];
+}
+
+#endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index 659c10142e81..6feaf82405f0 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -13,6 +13,7 @@
#ifndef GPU_OPS
#define GPU_OPS
+include "mlir/Dialect/GPU/GPUBase.td"
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/SideEffects.td"
@@ -26,42 +27,6 @@ def IntLikeOrLLVMInt : TypeConstraint<
// GPU Dialect operations.
//===----------------------------------------------------------------------===//
-def GPU_Dialect : Dialect {
- let name = "gpu";
- let hasOperationAttrVerify = 1;
-
- let extraClassDeclaration = [{
- /// Get the name of the attribute used to annotate the modules that contain
- /// kernel modules.
- static StringRef getContainerModuleAttrName() {
- return "gpu.container_module";
- }
- /// Get the name of the attribute used to annotate external kernel
- /// functions.
- static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }
-
- /// Get the name of the attribute used to annotate kernel modules.
- static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
-
- /// Returns whether the given function is a kernel function, i.e., has the
- /// 'gpu.kernel' attribute.
- static bool isKernel(Operation *op);
-
- /// Returns the number of workgroup (thread, block) dimensions supported in
- /// the GPU dialect.
- // TODO(zinenko,herhut): consider generalizing this.
- static unsigned getNumWorkgroupDimensions() { return 3; }
-
- /// Returns the numeric value used to identify the workgroup memory address
- /// space.
- static unsigned getWorkgroupAddressSpace() { return 3; }
-
- /// Returns the numeric value used to identify the private memory address
- /// space.
- static unsigned getPrivateAddressSpace() { return 5; }
- }];
-}
-
class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
Op<GPU_Dialect, mnemonic, traits>;
diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
index 92fb09ff6020..6bbcafb91925 100644
--- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -14,28 +14,48 @@
#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+#include "mlir/IR/Attributes.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc"
+
namespace mlir {
+class AffineMap;
+struct LogicalResult;
+class Operation;
class Region;
+#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc"
+
+namespace loop {
+class ParallelOp;
+}
+
namespace gpu {
/// Name of the mapping attribute produced by loop mappers.
-static constexpr const char *kMappingAttributeName = "mapping";
-/// Name of the processor sub-attribute that identifies the hardware id
-/// to map a loop to.
-static constexpr const char *kProcessorEntryName = "processor";
-/// Name of the map sub-attribute that identifies the affine map to apply
-/// to the hardware id to compute the iteration number of the loop. This
-/// map is expected to be extended by step and lower bound computations:
-/// index = map(hardware_id) * step + lowerbound
-static constexpr const char *kIndexMapEntryName = "map";
-/// Name of the bound sub-attribute that itendities the affine map to
-/// compute an upper bound of iterations for the hardware id. This is
-/// applied to an upper bound on the number of iterations:
-/// launchBound = bound(upperbound-lowerbound ceildiv step)
-static constexpr const char *kBoundMapEntryName = "bound";
+StringRef getMappingAttrName();
+/// Get the value of the processor in the ParallelLoopDimMapping attribute.
+inline Processor getProcessor(ParallelLoopDimMapping attr) {
+ return static_cast<Processor>(attr.processor().getInt());
+}
+
+/// Helper function to create a ParallelDimMapperAttr.
+/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method.
+ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
+ AffineMap map,
+ AffineMap bound);
+
+/// Sets the mapping attribute of a loop.parallel operation. Verifies that the
+/// mapping passed is valid.
+/// - the number of DimMapperAttr provided is same as the number of loops of
+/// the `ploopOp`.
+/// - the mapping does not map multiple loops to the same processor.
+LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
+ ArrayRef<ParallelLoopDimMapping> mapping);
} // end namespace gpu
/// Maps the parallel loops found in the given function to workgroups. The first
@@ -46,5 +66,4 @@ static constexpr const char *kBoundMapEntryName = "bound";
void greedilyMapParallelLoopsToGPU(Region ®ion);
} // end namespace mlir
-
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td
new file mode 100644
index 000000000000..1bfdfe5ebcfc
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td
@@ -0,0 +1,51 @@
+//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the attribute used for driving conversion from loop.parallel to
+// gpu.launch operations
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PARALLEL_LOOP_MAPPER_ATTR
+#define PARALLEL_LOOP_MAPPER_ATTR
+
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
+include "mlir/Dialect/GPU/GPUBase.td"
+
+def BlockX : I64EnumAttrCase<"BlockX", 0>;
+def BlockY : I64EnumAttrCase<"BlockY", 1>;
+def BlockZ : I64EnumAttrCase<"BlockZ", 2>;
+def ThreadX : I64EnumAttrCase<"ThreadX", 3>;
+def ThreadY : I64EnumAttrCase<"ThreadY", 4>;
+def ThreadZ : I64EnumAttrCase<"ThreadZ", 5>;
+def Sequential : I64EnumAttrCase<"Sequential", 6>;
+
+def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [
+ BlockX, BlockY, BlockZ, ThreadX, ThreadY, ThreadZ, Sequential]> {
+ let cppNamespace = "::mlir::gpu";
+}
+
+// Attribute that drives conversion of a loop.parallel to gpu.launch
+// operation.
+// processor: the hardware id to map to.
+// map : An affine map that is used to pre-process hardware ids before
+// substitution.
+// bound : An affine map that is used to compute the bound of the hardware
+// id based on an upper bound of the number of iterations.
+def ParallelLoopDimMappingAttr :
+ StructAttr<"ParallelLoopDimMapping", GPU_Dialect,
+ [StructFieldAttr<"processor", ProcessorAttr>,
+ StructFieldAttr<"map", AffineMapAttr>,
+ StructFieldAttr<"bound", AffineMapAttr>]>;
+
+
+def ParallelLoopMappingAttr :
+ TypedArrayAttrBase<ParallelLoopDimMappingAttr,
+ "parallel loop to processor mapping attribute">;
+
+#endif // PARALLEL_LOOP_MAPPER_ATTR
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index ec5c9cff2ee0..b9c81ea45592 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -500,35 +500,8 @@ struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
LogicalResult matchAndRewrite(ParallelOp parallelOp,
PatternRewriter &rewriter) const override;
};
-
-struct MappingAnnotation {
- unsigned processor;
- AffineMap indexMap;
- AffineMap boundMap;
-};
-
} // namespace
-/// Extracts the mapping annotations from the provided attribute. The attribute
-/// is expected to be of the form
-/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
-/// where the bound is optional.
-static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
- DictionaryAttr dict = attribute.cast<DictionaryAttr>();
- unsigned processor = dict.get(gpu::kProcessorEntryName)
- .cast<IntegerAttr>()
- .getValue()
- .getSExtValue();
- AffineMap map =
- dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
- AffineMapAttr boundAttr =
- dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
- AffineMap bound;
- if (boundAttr)
- bound = boundAttr.getValue();
- return {processor, map, bound};
-}
-
/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
@@ -546,6 +519,30 @@ static Value deriveStaticUpperBound(Value upperBound,
return {};
}
+static bool isMappedToProcessor(gpu::Processor processor) {
+ return processor != gpu::Processor::Sequential;
+}
+
+static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
+ switch (processor) {
+ case gpu::Processor::BlockX:
+ return 0;
+ case gpu::Processor::BlockY:
+ return 1;
+ case gpu::Processor::BlockZ:
+ return 2;
+ case gpu::Processor::ThreadX:
+ return 3;
+ case gpu::Processor::ThreadY:
+ return 4;
+ case gpu::Processor::ThreadZ:
+ return 5;
+ default:;
+ }
+ llvm_unreachable(
+ "invalid processor type while retrieving launch op argument number");
+}
+
/// Modifies the current transformation state to capture the effect of the given
/// `loop.parallel` operation on index substitutions and the operations to be
/// inserted.
@@ -568,16 +565,14 @@ static Value deriveStaticUpperBound(Value upperBound,
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
-static LogicalResult processParallelLoop(ParallelOp parallelOp,
- gpu::LaunchOp launchOp,
- BlockAndValueMapping &cloningMap,
- SmallVectorImpl<Operation *> &worklist,
- DenseMap<int, Value> &bounds,
- PatternRewriter &rewriter) {
+static LogicalResult processParallelLoop(
+ ParallelOp parallelOp, gpu::LaunchOp launchOp,
+ BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
+ DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
// TODO(herhut): Verify that this is a valid GPU mapping.
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
ArrayAttr mapping =
- parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
+ parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
// TODO(herhut): Support reductions.
if (!mapping || parallelOp.getNumResults() != 0)
@@ -604,12 +599,17 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
Attribute mappingAttribute;
Value iv, lowerBound, upperBound, step;
std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
- MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute);
+ auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
+ if (!annotation)
+ return parallelOp.emitOpError()
+ << "expected mapping attribute for lowering to GPU";
Value newIndex;
+ gpu::Processor processor = gpu::getProcessor(annotation);
- if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {
+ if (isMappedToProcessor(processor)) {
// Use the corresponding thread/grid index as replacement for the loop iv.
- Value operand = launchOp.body().front().getArgument(annotation.processor);
+ Value operand = launchOp.body().front().getArgument(
+ getLaunchOpArgumentNum(processor));
// Take the indexmap and add the lower bound and step computations in.
// This computes operand * step + lowerBound.
// Use an affine map here so that it composes nicely with the provided
@@ -619,11 +619,11 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
rewriter.getAffineSymbolExpr(1));
newIndex = rewriter.create<AffineApplyOp>(
- loc, annotation.indexMap.compose(lowerAndStep),
+ loc, annotation.map().getValue().compose(lowerAndStep),
ValueRange{operand, step, lowerBound});
// If there was also a bound, insert that, too.
// TODO(herhut): Check that we do not assign bounds twice.
- if (annotation.boundMap) {
+ if (annotation.bound().getValue()) {
// We pass as the single opererand to the bound-map the number of
// iterations, which is (upperBound - lowerBound) ceilDiv step. To
// support inner loops with dynamic upper bounds (as generated by e.g.
@@ -663,19 +663,21 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
rewriter.getAffineSymbolExpr(1))
.ceilDiv(rewriter.getAffineSymbolExpr(2))));
Value launchBound = rewriter.create<AffineApplyOp>(
- loc, annotation.boundMap.compose(stepMap),
+ loc, annotation.bound().getValue().compose(stepMap),
ValueRange{
ensureLaunchIndependent(
cloningMap.lookupOrDefault(upperBound)),
ensureLaunchIndependent(
cloningMap.lookupOrDefault(lowerBound)),
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
- if (bounds.find(annotation.processor) != bounds.end()) {
+ // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
+ // when this condition is relaxed.
+ if (bounds.find(processor) != bounds.end()) {
return parallelOp.emitOpError()
<< "cannot redefine the bound for processor "
- << annotation.processor;
+ << static_cast<int64_t>(processor);
}
- bounds[annotation.processor] = launchBound;
+ bounds[processor] = launchBound;
}
if (!boundIsPrecise) {
// We are using an approximation, create a surrounding conditional.
@@ -757,7 +759,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
rewriter.setInsertionPointToStart(&launchOp.body().front());
BlockAndValueMapping cloningMap;
- llvm::DenseMap<int, Value> launchBounds;
+ llvm::DenseMap<gpu::Processor, Value> launchBounds;
SmallVector<Operation *, 16> worklist;
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
launchBounds, rewriter)))
@@ -809,7 +811,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
// Now that we succeeded creating the launch operation, also update the
// bounds.
for (auto bound : launchBounds)
- launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
+ launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
+ std::get<1>(bound));
rewriter.eraseOp(parallelOp);
return success();
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index b83c3ca6a421..e71a018a451c 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,8 @@ add_mlir_dialect_library(MLIRGPU
DEPENDS
MLIRGPUOpsIncGen
+ MLIRParallelLoopMapperAttrGen
+ MLIRParallelLoopMapperEnumsGen
)
target_link_libraries(MLIRGPU
PUBLIC
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index f85a0c702729..9697688ac850 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -23,6 +23,43 @@ using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::loop;
+#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
+namespace mlir {
+
+#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
+namespace gpu {
+
+StringRef getMappingAttrName() { return "mapping"; }
+
+ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
+ AffineMap map,
+ AffineMap bound) {
+ MLIRContext *context = map.getContext();
+ OpBuilder builder(context);
+ return ParallelLoopDimMapping::get(
+ builder.getI64IntegerAttr(static_cast<int32_t>(processor)),
+ AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
+}
+
+LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
+ ArrayRef<ParallelLoopDimMapping> mapping) {
+ // Verify that each processor is mapped to only once.
+ llvm::DenseSet<gpu::Processor> specifiedMappings;
+ for (auto dimAttr : mapping) {
+ gpu::Processor processor = getProcessor(dimAttr);
+ if (processor != gpu::Processor::Sequential &&
+ specifiedMappings.count(processor))
+ return ploopOp.emitError(
+ "invalid mapping multiple loops to same processor");
+ }
+ ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size());
+ ploopOp.setAttr(getMappingAttrName(),
+ ArrayAttr::get(mappingAsAttrs, ploopOp.getContext()));
+ return success();
+}
+} // namespace gpu
+} // namespace mlir
+
namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
@@ -43,10 +80,41 @@ MappingLevel &operator++(MappingLevel &mappingLevel) {
/// Computed the hardware id to use for a given mapping level. Will
/// assign x,y and z hardware ids for the first 3 dimensions and use
/// sequential after.
-static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
+/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is
+/// distributed to map to x, the next innermost to y and the next innermost to
+/// z.
+static gpu::Processor getHardwareIdForMapping(MappingLevel level,
+ int dimension) {
+
if (dimension >= kNumHardwareIds || level == Sequential)
- return Sequential * kNumHardwareIds;
- return (level * kNumHardwareIds) + dimension;
+ return Processor::Sequential;
+ switch (level) {
+ case MapGrid:
+ switch (dimension) {
+ case 0:
+ return Processor::BlockX;
+ case 1:
+ return Processor::BlockY;
+ case 2:
+ return Processor::BlockZ;
+ default:
+ return Processor::Sequential;
+ }
+ break;
+ case MapBlock:
+ switch (dimension) {
+ case 0:
+ return Processor::ThreadX;
+ case 1:
+ return Processor::ThreadY;
+ case 2:
+ return Processor::ThreadZ;
+ default:
+ return Processor::Sequential;
+ }
+ default:;
+ }
+ return Processor::Sequential;
}
/// Add mapping information to the given parallel loop. Do not add
@@ -55,26 +123,20 @@ static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
static void mapParallelOp(ParallelOp parallelOp,
MappingLevel mappingLevel = MapGrid) {
// Do not try to add a mapping to already mapped loops or nested loops.
- if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
+ if (parallelOp.getAttr(getMappingAttrName()) ||
((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
return;
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
- SmallVector<Attribute, 4> attrs;
+ SmallVector<ParallelLoopDimMapping, 4> attrs;
attrs.reserve(parallelOp.getNumInductionVars());
for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
- SmallVector<NamedAttribute, 3> entries;
- entries.emplace_back(b.getNamedAttr(
- kProcessorEntryName,
- b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
- entries.emplace_back(b.getNamedAttr(
- kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
- entries.emplace_back(b.getNamedAttr(
- kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
- attrs.push_back(DictionaryAttr::get(entries, ctx));
+ attrs.push_back(getParallelLoopDimMappingAttr(
+ getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
+ b.getDimIdentityMap()));
}
- parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
+ setMappingAttr(parallelOp, attrs);
++mappingLevel;
// Parallel loop operations are immediately nested, so do not use
// walk but just iterate over the operations.
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
index 4bc97da954ff..ab195936d83a 100644
--- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -3,7 +3,7 @@
// 2-d parallel loop mapped to block.y and block.x
func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
- %arg3 : index, %arg4 : index,
+ %arg3 : index, %arg4 : index,
%buf : memref<?x?xf32>,
%res : memref<?x?xf32>) {
%step = constant 2 : index
@@ -334,7 +334,7 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
// expected-error at +1 {{failed to legalize operation 'loop.parallel'}}
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
step (%four, %four) {
- // expected-error at +1 {{cannot derive loop-invariant upper bound}}
+ // expected-error at +1 {{cannot derive loop-invariant upper bound}}
loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
step (%one, %one) {
%idx0 = addi %i0, %si0 : index
More information about the Mlir-commits
mailing list