[Mlir-commits] [mlir] 9ad5e57 - [mlir][nvvm][rocdl] refactor NVVM and ROCDL dialect. NFC.
Alex Zinenko
llvmlistbot at llvm.org
Thu Apr 30 15:13:35 PDT 2020
Author: Wen-Heng (Jack) Chung
Date: 2020-05-01T00:13:26+02:00
New Revision: 9ad5e57316577c63a16c48c2b70d94f319317d2d
URL: https://github.com/llvm/llvm-project/commit/9ad5e57316577c63a16c48c2b70d94f319317d2d
DIFF: https://github.com/llvm/llvm-project/commit/9ad5e57316577c63a16c48c2b70d94f319317d2d.diff
LOG: [mlir][nvvm][rocdl] refactor NVVM and ROCDL dialect. NFC.
- Extract common logic between -convert-gpu-to-nvvm and -convert-gpu-to-rocdl.
- Cope with the fact that alloca operates on different address spaces in NVVM
and ROCDL.
- Modernize unit tests for the ROCDL dialect.
Differential Revision: https://reviews.llvm.org/D79021
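
The shared lowering is parameterized on the address space used for
private-memory allocas, and each backend instantiates it with its own value.
A minimal sketch of the two registrations, condensed from the hunks below:

    // NVVM: lower private memory attributions to allocas in the default
    // address space (0); NVVM does not support allocas with addrspace(5).
    patterns.insert<GPUFuncOpLowering<0>, GPUReturnOpLowering>(converter);

    // ROCDL: private memory lives in addrspace(5) on AMDGPU.
    patterns.insert<GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);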
Added:
mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
Modified:
mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
Removed:
mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 341526fc9964..1722ae628e88 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -11,11 +11,19 @@
#include <memory>
namespace mlir {
+class LLVMTypeConverter;
+class OwningRewritePatternList;
+
+template <typename OpT>
+class OperationPass;
namespace gpu {
class GPUModuleOp;
} // namespace gpu
-template <typename OpT> class OperationPass;
+
+/// Collect a set of patterns to convert from the GPU dialect to ROCDL.
+void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
+ OwningRewritePatternList &patterns);
/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
new file mode 100644
index 000000000000..b26e85a15b99
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -0,0 +1,171 @@
+//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
+#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
+
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Builders.h"
+
+namespace mlir {
+
+template <unsigned AllocaAddrSpace>
+struct GPUFuncOpLowering : ConvertToLLVMPattern {
+ explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
+ : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
+ typeConverter.getDialect()->getContext(),
+ typeConverter) {}
+
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override {
+ assert(operands.empty() && "func op is not expected to have operands");
+ auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
+ Location loc = gpuFuncOp.getLoc();
+
+ SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
+ workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+ for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+ Value attribution = en.value();
+
+ auto type = attribution.getType().dyn_cast<MemRefType>();
+ assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+ uint64_t numElements = type.getNumElements();
+
+ auto elementType = typeConverter.convertType(type.getElementType())
+ .template cast<LLVM::LLVMType>();
+ auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
+ std::string name = std::string(
+ llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
+ auto globalOp = rewriter.create<LLVM::GlobalOp>(
+ gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
+ LLVM::Linkage::Internal, name, /*value=*/Attribute(),
+ gpu::GPUDialect::getWorkgroupAddressSpace());
+ workgroupBuffers.push_back(globalOp);
+ }
+
+ // Rewrite the original GPU function to an LLVM function.
+ auto funcType = typeConverter.convertType(gpuFuncOp.getType())
+ .template cast<LLVM::LLVMType>()
+ .getPointerElementTy();
+
+ // Remap proper input types.
+ TypeConverter::SignatureConversion signatureConversion(
+ gpuFuncOp.front().getNumArguments());
+ typeConverter.convertFunctionSignature(
+ gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
+
+ // Create the new function operation. Only copy those attributes that are
+ // not specific to function modeling.
+ SmallVector<NamedAttribute, 4> attributes;
+ for (const auto &attr : gpuFuncOp.getAttrs()) {
+ if (attr.first == SymbolTable::getSymbolAttrName() ||
+ attr.first == impl::getTypeAttrName() ||
+ attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
+ continue;
+ attributes.push_back(attr);
+ }
+ auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
+ LLVM::Linkage::External, attributes);
+
+ {
+ // Insert operations that correspond to converted workgroup and private
+ // memory attributions to the body of the function. This must operate on
+ // the original function, before the body region is inlined in the new
+ // function to maintain the relation between block arguments and the
+ // parent operation that assigns their semantics.
+ OpBuilder::InsertionGuard guard(rewriter);
+
+ // Rewrite workgroup memory attributions to addresses of global buffers.
+ rewriter.setInsertionPointToStart(&gpuFuncOp.front());
+ unsigned numProperArguments = gpuFuncOp.getNumArguments();
+ auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
+
+ Value zero = nullptr;
+ if (!workgroupBuffers.empty())
+ zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
+ rewriter.getI32IntegerAttr(0));
+ for (auto en : llvm::enumerate(workgroupBuffers)) {
+ LLVM::GlobalOp global = en.value();
+ Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
+ auto elementType = global.getType().getArrayElementType();
+ Value memory = rewriter.create<LLVM::GEPOp>(
+ loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
+ address, ArrayRef<Value>{zero, zero});
+
+ // Build a memref descriptor pointing to the buffer to plug with the
+ // existing memref infrastructure. This may use more registers than
+ // otherwise necessary given that memref sizes are fixed, but we can try
+ // and canonicalize that away later.
+ Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
+ auto type = attribution.getType().cast<MemRefType>();
+ auto descr = MemRefDescriptor::fromStaticShape(
+ rewriter, loc, typeConverter, type, memory);
+ signatureConversion.remapInput(numProperArguments + en.index(), descr);
+ }
+
+ // Rewrite private memory attributions to alloca'ed buffers.
+ unsigned numWorkgroupAttributions =
+ gpuFuncOp.getNumWorkgroupAttributions();
+ auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
+ for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
+ Value attribution = en.value();
+ auto type = attribution.getType().cast<MemRefType>();
+ assert(type && type.hasStaticShape() &&
+ "unexpected type in attribution");
+
+ // Explicitly drop memory space when lowering private memory
+ // attributions since NVVM models it as `alloca`s in the default
+ // memory space and does not support `alloca`s with addrspace(5).
+ auto ptrType = typeConverter.convertType(type.getElementType())
+ .template cast<LLVM::LLVMType>()
+ .getPointerTo(AllocaAddrSpace);
+ Value numElements = rewriter.create<LLVM::ConstantOp>(
+ gpuFuncOp.getLoc(), int64Ty,
+ rewriter.getI64IntegerAttr(type.getNumElements()));
+ Value allocated = rewriter.create<LLVM::AllocaOp>(
+ gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
+ auto descr = MemRefDescriptor::fromStaticShape(
+ rewriter, loc, typeConverter, type, allocated);
+ signatureConversion.remapInput(
+ numProperArguments + numWorkgroupAttributions + en.index(), descr);
+ }
+ }
+
+ // Move the region to the new function, update the entry block signature.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
+ llvmFuncOp.end());
+ rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
+ signatureConversion);
+
+ rewriter.eraseOp(gpuFuncOp);
+ return success();
+ }
+};
+
+struct GPUReturnOpLowering : public ConvertToLLVMPattern {
+ GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
+ : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
+ typeConverter.getDialect()->getContext(),
+ typeConverter) {}
+
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
+ return success();
+ }
+};
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
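
With GPUFuncOpLowering now a header-only template under GPUCommon, another
GPU-to-LLVM backend could reuse it by including the header and choosing its
private-memory address space. A hypothetical sketch, assuming an illustrative
backend name and address-space constant that are not part of this commit:

    #include "../GPUCommon/GPUOpsLowering.h"

    // Hypothetical backend registration; `kPrivateAddrSpace` is an
    // illustrative constant, not something defined by this commit.
    void populateGpuToFooConversionPatterns(LLVMTypeConverter &converter,
                                            OwningRewritePatternList &patterns) {
      constexpr unsigned kPrivateAddrSpace = 0;
      patterns.insert<GPUFuncOpLowering<kPrivateAddrSpace>,
                      GPUReturnOpLowering>(converter);
    }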
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 66a0a94a499e..afc06c5727a2 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -21,6 +21,7 @@
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
+#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@@ -88,155 +89,6 @@ struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
}
};
-struct GPUFuncOpLowering : ConvertToLLVMPattern {
- explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
- : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
- typeConverter.getDialect()->getContext(),
- typeConverter) {}
-
- LogicalResult
- matchAndRewrite(Operation *op, ArrayRef<Value> operands,
- ConversionPatternRewriter &rewriter) const override {
- assert(operands.empty() && "func op is not expected to have operands");
- auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
- Location loc = gpuFuncOp.getLoc();
-
- SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
- workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
- for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
- Value attribution = en.value();
-
- auto type = attribution.getType().dyn_cast<MemRefType>();
- assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
- uint64_t numElements = type.getNumElements();
-
- auto elementType = typeConverter.convertType(type.getElementType())
- .cast<LLVM::LLVMType>();
- auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
- std::string name = std::string(
- llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
- auto globalOp = rewriter.create<LLVM::GlobalOp>(
- gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
- LLVM::Linkage::Internal, name, /*value=*/Attribute(),
- gpu::GPUDialect::getWorkgroupAddressSpace());
- workgroupBuffers.push_back(globalOp);
- }
-
- // Rewrite the original GPU function to an LLVM function.
- auto funcType = typeConverter.convertType(gpuFuncOp.getType())
- .cast<LLVM::LLVMType>()
- .getPointerElementTy();
-
- // Remap proper input types.
- TypeConverter::SignatureConversion signatureConversion(
- gpuFuncOp.front().getNumArguments());
- typeConverter.convertFunctionSignature(
- gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
-
- // Create the new function operation. Only copy those attributes that are
- // not specific to function modeling.
- SmallVector<NamedAttribute, 4> attributes;
- for (const auto &attr : gpuFuncOp.getAttrs()) {
- if (attr.first == SymbolTable::getSymbolAttrName() ||
- attr.first == impl::getTypeAttrName() ||
- attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
- continue;
- attributes.push_back(attr);
- }
- auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
- gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
- LLVM::Linkage::External, attributes);
-
- {
- // Insert operations that correspond to converted workgroup and private
- // memory attributions to the body of the function. This must operate on
- // the original function, before the body region is inlined in the new
- // function to maintain the relation between block arguments and the
- // parent operation that assigns their semantics.
- OpBuilder::InsertionGuard guard(rewriter);
-
- // Rewrite workgroup memory attributions to addresses of global buffers.
- rewriter.setInsertionPointToStart(&gpuFuncOp.front());
- unsigned numProperArguments = gpuFuncOp.getNumArguments();
- auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
-
- Value zero = nullptr;
- if (!workgroupBuffers.empty())
- zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
- rewriter.getI32IntegerAttr(0));
- for (auto en : llvm::enumerate(workgroupBuffers)) {
- LLVM::GlobalOp global = en.value();
- Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
- auto elementType = global.getType().getArrayElementType();
- Value memory = rewriter.create<LLVM::GEPOp>(
- loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
- address, ArrayRef<Value>{zero, zero});
-
- // Build a memref descriptor pointing to the buffer to plug with the
- // existing memref infrastructure. This may use more registers than
- // otherwise necessary given that memref sizes are fixed, but we can try
- // and canonicalize that away later.
- Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
- auto type = attribution.getType().cast<MemRefType>();
- auto descr = MemRefDescriptor::fromStaticShape(
- rewriter, loc, typeConverter, type, memory);
- signatureConversion.remapInput(numProperArguments + en.index(), descr);
- }
-
- // Rewrite private memory attributions to alloca'ed buffers.
- unsigned numWorkgroupAttributions =
- gpuFuncOp.getNumWorkgroupAttributions();
- auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
- for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
- Value attribution = en.value();
- auto type = attribution.getType().cast<MemRefType>();
- assert(type && type.hasStaticShape() &&
- "unexpected type in attribution");
-
- // Explicitly drop memory space when lowering private memory
- // attributions since NVVM models it as `alloca`s in the default
- // memory space and does not support `alloca`s with addrspace(5).
- auto ptrType = typeConverter.convertType(type.getElementType())
- .cast<LLVM::LLVMType>()
- .getPointerTo();
- Value numElements = rewriter.create<LLVM::ConstantOp>(
- gpuFuncOp.getLoc(), int64Ty,
- rewriter.getI64IntegerAttr(type.getNumElements()));
- Value allocated = rewriter.create<LLVM::AllocaOp>(
- gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
- auto descr = MemRefDescriptor::fromStaticShape(
- rewriter, loc, typeConverter, type, allocated);
- signatureConversion.remapInput(
- numProperArguments + numWorkgroupAttributions + en.index(), descr);
- }
- }
-
- // Move the region to the new function, update the entry block signature.
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
- llvmFuncOp.end());
- rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
- signatureConversion);
-
- rewriter.eraseOp(gpuFuncOp);
- return success();
- }
-};
-
-struct GPUReturnOpLowering : public ConvertToLLVMPattern {
- GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
- : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
- typeConverter.getDialect()->getContext(),
- typeConverter) {}
-
- LogicalResult
- matchAndRewrite(Operation *op, ArrayRef<Value> operands,
- ConversionPatternRewriter &rewriter) const override {
- rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
- return success();
- }
-};
-
/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"
@@ -300,8 +152,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,
- GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
- converter);
+ GPUShuffleOpLowering, GPUReturnOpLowering,
+ // Explicitly drop memory space when lowering private memory
+ // attributions since NVVM models it as `alloca`s in the default
+ // memory space and does not support `alloca`s with addrspace(5).
+ GPUFuncOpLowering<0>>(converter);
patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
"__nv_fabs");
patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index ed78bcfb1e76..9661644909e2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -14,11 +14,16 @@
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@@ -38,41 +43,25 @@ class LowerGpuOpsToROCDLOpsPass
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
- OwningRewritePatternList patterns;
LLVMTypeConverter converter(m.getContext());
- populateStdToLLVMConversionPatterns(converter, patterns);
- patterns.insert<
- GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
- ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
- GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
- ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
- GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
- ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
- GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
- ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
- converter);
- patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
- "__ocml_fabs_f64");
- patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
- "__ocml_ceil_f64");
- patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
- "__ocml_cos_f64");
- patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
- "__ocml_exp_f64");
- patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
- "__ocml_log_f64");
- patterns.insert<OpToFuncCallLowering<Log10Op>>(
- converter, "__ocml_log10_f32", "__ocml_log10_f64");
- patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
- "__ocml_log2_f64");
- patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
- "__ocml_tanh_f64");
- ConversionTarget target(getContext());
- target.addLegalDialect<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
+ OwningRewritePatternList patterns;
+
+ populateGpuRewritePatterns(m.getContext(), patterns);
+ applyPatternsAndFoldGreedily(m, patterns);
+ patterns.clear();
+
+ populateVectorToLLVMConversionPatterns(converter, patterns);
+ populateStdToLLVMConversionPatterns(converter, patterns);
+ populateGpuToROCDLConversionPatterns(converter, patterns);
+ LLVMConversionTarget target(getContext());
+ target.addIllegalDialect<gpu::GPUDialect>();
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op>();
target.addIllegalOp<FuncOp>();
+ target.addLegalDialect<ROCDL::ROCDLDialect>();
+ // TODO(whchung): Remove once we support replacing non-root ops.
+ target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
if (failed(applyPartialConversion(m, target, patterns, &converter)))
signalPassFailure();
}
@@ -80,6 +69,36 @@ class LowerGpuOpsToROCDLOpsPass
} // anonymous namespace
+void mlir::populateGpuToROCDLConversionPatterns(
+ LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
+ patterns.insert<
+ GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
+ ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
+ GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+ ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
+ GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
+ ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
+ GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
+ ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
+ GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
+ patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
+ "__ocml_fabs_f64");
+ patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
+ "__ocml_ceil_f64");
+ patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
+ "__ocml_cos_f64");
+ patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
+ "__ocml_exp_f64");
+ patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
+ "__ocml_log_f64");
+ patterns.insert<OpToFuncCallLowering<Log10Op>>(converter, "__ocml_log10_f32",
+ "__ocml_log10_f64");
+ patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
+ "__ocml_log2_f64");
+ patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
+ "__ocml_tanh_f64");
+}
+
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass() {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
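
The reworked runOnOperation above proceeds in two phases: GPU-dialect rewrite
patterns are first applied greedily, then the pattern list is rebuilt with the
vector, standard, and GPU-to-ROCDL conversion patterns for a partial
conversion to LLVM/ROCDL. Schematically, condensed from the hunk above:

    // Phase 1: greedy, in-dialect GPU rewrites (e.g. expanding gpu.all_reduce).
    populateGpuRewritePatterns(m.getContext(), patterns);
    applyPatternsAndFoldGreedily(m, patterns);
    patterns.clear();

    // Phase 2: partial dialect conversion to LLVM and ROCDL.
    populateVectorToLLVMConversionPatterns(converter, patterns);
    populateStdToLLVMConversionPatterns(converter, patterns);
    populateGpuToROCDLConversionPatterns(converter, patterns);
    if (failed(applyPartialConversion(m, target, patterns, &converter)))
      signalPassFailure();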
diff --git a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
new file mode 100644
index 000000000000..f0c9b414c9c8
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
@@ -0,0 +1,231 @@
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
+
+gpu.module @kernel {
+ // NVVM-LABEL: llvm.func @private
+ gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
+ // Allocate private memory inside the function.
+ // NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
+ // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
+
+ // ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
+ // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
+
+ // Populate the memref descriptor.
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
+ // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }">
+ // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+ // "Store" lowering should work just as any other memref, only check that
+ // we emit some core instructions.
+ // NVVM: llvm.extractvalue %[[descr6:.*]]
+ // NVVM: llvm.getelementptr
+ // NVVM: llvm.store
+
+ // ROCDL: llvm.extractvalue %[[descr6:.*]]
+ // ROCDL: llvm.getelementptr
+ // ROCDL: llvm.store
+ %c0 = constant 0 : index
+ store %arg0, %arg1[%c0] : memref<4xf32, 5>
+
+ "terminator"() : () -> ()
+ }
+}
+
+// -----
+
+gpu.module @kernel {
+ // Workgroup buffers are allocated as globals.
+ // NVVM: llvm.mlir.global internal @[[buffer:.*]]()
+ // NVVM-SAME: addr_space = 3
+ // NVVM-SAME: !llvm<"[4 x float]">
+
+ // ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
+ // ROCDL-SAME: addr_space = 3
+ // ROCDL-SAME: !llvm<"[4 x float]">
+
+ // NVVM-LABEL: llvm.func @workgroup
+ // NVVM-SAME: {
+
+ // ROCDL-LABEL: llvm.func @workgroup
+ // ROCDL-SAME: {
+ gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
+ // Get the address of the first element in the global array.
+ // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+ // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
+ // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+ // NVVM-SAME: !llvm<"float addrspace(3)*">
+
+ // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+ // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
+ // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+ // ROCDL-SAME: !llvm<"float addrspace(3)*">
+
+ // Populate the memref descriptor.
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
+ // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
+ // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+ // "Store" lowering should work just as any other memref, only check that
+ // we emit some core instructions.
+ // NVVM: llvm.extractvalue %[[descr6:.*]]
+ // NVVM: llvm.getelementptr
+ // NVVM: llvm.store
+
+ // ROCDL: llvm.extractvalue %[[descr6:.*]]
+ // ROCDL: llvm.getelementptr
+ // ROCDL: llvm.store
+ %c0 = constant 0 : index
+ store %arg0, %arg1[%c0] : memref<4xf32, 3>
+
+ "terminator"() : () -> ()
+ }
+}
+
+// -----
+
+gpu.module @kernel {
+ // Check that the total size was computed correctly.
+ // NVVM: llvm.mlir.global internal @[[buffer:.*]]()
+ // NVVM-SAME: addr_space = 3
+ // NVVM-SAME: !llvm<"[48 x float]">
+
+ // ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
+ // ROCDL-SAME: addr_space = 3
+ // ROCDL-SAME: !llvm<"[48 x float]">
+
+ // NVVM-LABEL: llvm.func @workgroup3d
+ // ROCDL-LABEL: llvm.func @workgroup3d
+ gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
+ // Get the address of the first element in the global array.
+ // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+ // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
+ // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+ // NVVM-SAME: !llvm<"float addrspace(3)*">
+
+ // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+ // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
+ // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+ // ROCDL-SAME: !llvm<"float addrspace(3)*">
+
+ // Populate the memref descriptor.
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
+ // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
+ // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
+ // NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
+ // NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
+ // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+ // NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
+ // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+ // NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
+ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
+
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
+ // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+ // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+ // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+ // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+ // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+ // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+ // ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
+ // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
+ // ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
+ // ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
+ // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+ // ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
+ // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+ // ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
+ // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+ // ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
+
+ %c0 = constant 0 : index
+ store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
+ "terminator"() : () -> ()
+ }
+}
+
+// -----
+
+gpu.module @kernel {
+ // Check that several buffers are defined.
+ // NVVM: llvm.mlir.global internal @[[buffer1:.*]]()
+ // NVVM-SAME: !llvm<"[1 x float]">
+ // NVVM: llvm.mlir.global internal @[[buffer2:.*]]()
+ // NVVM-SAME: !llvm<"[2 x float]">
+
+ // ROCDL: llvm.mlir.global internal @[[buffer1:.*]]()
+ // ROCDL-SAME: !llvm<"[1 x float]">
+ // ROCDL: llvm.mlir.global internal @[[buffer2:.*]]()
+ // ROCDL-SAME: !llvm<"[2 x float]">
+
+ // NVVM-LABEL: llvm.func @multiple
+ // ROCDL-LABEL: llvm.func @multiple
+ gpu.func @multiple(%arg0: f32)
+ workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
+ private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
+
+ // Workgroup buffers.
+ // NVVM: llvm.mlir.addressof @[[buffer1]]
+ // NVVM: llvm.mlir.addressof @[[buffer2]]
+
+ // ROCDL: llvm.mlir.addressof @[[buffer1]]
+ // ROCDL: llvm.mlir.addressof @[[buffer2]]
+
+ // Private buffers.
+ // NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
+ // NVVM: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
+ // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
+ // NVVM: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
+
+ // ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
+ // ROCDL: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
+ // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
+ // ROCDL: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
+
+ %c0 = constant 0 : index
+ store %arg0, %arg1[%c0] : memref<1xf32, 3>
+ store %arg0, %arg2[%c0] : memref<2xf32, 3>
+ store %arg0, %arg3[%c0] : memref<3xf32, 5>
+ store %arg0, %arg4[%c0] : memref<4xf32, 5>
+ "terminator"() : () -> ()
+ }
+}
diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
deleted file mode 100644
index 68b615725ad4..000000000000
--- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
+++ /dev/null
@@ -1,145 +0,0 @@
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
-
-gpu.module @kernel {
- // CHECK-LABEL: llvm.func @private
- gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
- // Allocate private memory inside the function.
- // CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
- // CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
-
- // Populate the memref descriptor.
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
- // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
- // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
- // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
- // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
- // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
- // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
- // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
- // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
-
- // "Store" lowering should work just as any other memref, only check that
- // we emit some core instructions.
- // CHECK: llvm.extractvalue %[[descr6:.*]]
- // CHECK: llvm.getelementptr
- // CHECK: llvm.store
- %c0 = constant 0 : index
- store %arg0, %arg1[%c0] : memref<4xf32, 5>
-
- "terminator"() : () -> ()
- }
-}
-
-// -----
-
-gpu.module @kernel {
- // Workgroup buffers are allocated as globals.
- // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
- // CHECK-SAME: addr_space = 3
- // CHECK-SAME: !llvm<"[4 x float]">
-
- // CHECK-LABEL: llvm.func @workgroup
- // CHECK-SAME: {
- gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
- // Get the address of the first element in the global array.
- // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
- // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
- // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
- // CHECK-SAME: !llvm<"float addrspace(3)*">
-
- // Populate the memref descriptor.
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
- // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
- // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
- // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
- // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
- // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
- // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
- // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
- // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
-
- // "Store" lowering should work just as any other memref, only check that
- // we emit some core instructions.
- // CHECK: llvm.extractvalue %[[descr6:.*]]
- // CHECK: llvm.getelementptr
- // CHECK: llvm.store
- %c0 = constant 0 : index
- store %arg0, %arg1[%c0] : memref<4xf32, 3>
-
- "terminator"() : () -> ()
- }
-}
-
-// -----
-
-gpu.module @kernel {
- // Check that the total size was computed correctly.
- // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
- // CHECK-SAME: addr_space = 3
- // CHECK-SAME: !llvm<"[48 x float]">
-
- // CHECK-LABEL: llvm.func @workgroup3d
- gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
- // Get the address of the first element in the global array.
- // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
- // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
- // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
- // CHECK-SAME: !llvm<"float addrspace(3)*">
-
- // Populate the memref descriptor.
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
- // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
- // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
- // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
- // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
- // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
- // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
- // CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
- // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
- // CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
- // CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
- // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
- // CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
- // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
- // CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
- // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
- // CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
-
- %c0 = constant 0 : index
- store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
- "terminator"() : () -> ()
- }
-}
-
-// -----
-
-gpu.module @kernel {
- // Check that several buffers are defined.
- // CHECK: llvm.mlir.global internal @[[buffer1:.*]]()
- // CHECK-SAME: !llvm<"[1 x float]">
- // CHECK: llvm.mlir.global internal @[[buffer2:.*]]()
- // CHECK-SAME: !llvm<"[2 x float]">
-
- // CHECK-LABEL: llvm.func @multiple
- gpu.func @multiple(%arg0: f32)
- workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
- private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
-
- // Workgroup buffers.
- // CHECK: llvm.mlir.addressof @[[buffer1]]
- // CHECK: llvm.mlir.addressof @[[buffer2]]
-
- // Private buffers.
- // CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
- // CHECK: llvm.alloca %[[c3]] x !llvm.float
- // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
- // CHECK: llvm.alloca %[[c4]] x !llvm.float
-
- %c0 = constant 0 : index
- store %arg0, %arg1[%c0] : memref<1xf32, 3>
- store %arg0, %arg2[%c0] : memref<2xf32, 3>
- store %arg0, %arg3[%c0] : memref<3xf32, 5>
- store %arg0, %arg4[%c0] : memref<4xf32, 5>
- "terminator"() : () -> ()
- }
-}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 7400d4f0bb1e..c893fd52a4fd 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,9 +1,10 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
func @gpu_index_ops()
- attributes { gpu.kernel } {
+ -> (index, index, index, index, index, index,
+ index, index, index, index, index, index) {
// CHECK: rocdl.workitem.id.x : !llvm.i32
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
// CHECK: rocdl.workitem.id.y : !llvm.i32
@@ -32,68 +33,71 @@ gpu.module @kernel_module {
// CHECK: rocdl.grid.dim.z : !llvm.i32
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
- std.return
+ std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
+ %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ
+ : index, index, index, index, index, index,
+ index, index, index, index, index, index
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_fabs
- func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.absf %arg_f32 : f32
// CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.absf %arg_f64 : f64
// CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_ceil
- func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.ceilf %arg_f32 : f32
// CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.ceilf %arg_f64 : f64
// CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_cos
- func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.cos %arg_f32 : f32
// CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.cos %arg_f64 : f64
// CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_exp
- func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%exp_f32 = std.exp %arg_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
- %result_f32 = std.exp %exp_f32 : f32
+ %result32 = std.exp %exp_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.exp %arg_f64 : f64
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
@@ -101,20 +105,20 @@ gpu.module @kernel_module {
// -----
// Test that we handled properly operation with SymbolTable other than module op
-gpu.module @kernel_module {
+gpu.module @test_module {
"test.symbol_scope"() ({
// CHECK: test.symbol_scope
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_exp
- func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%exp_f32 = std.exp %arg_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
- %result_f32 = std.exp %exp_f32 : f32
+ %result32 = std.exp %exp_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.exp %arg_f64 : f64
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
"test.finish" () : () -> ()
}) : () -> ()
@@ -122,60 +126,60 @@ gpu.module @kernel_module {
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log
- func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log %arg_f32 : f32
// CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log %arg_f64 : f64
// CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_log10_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log10
- func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log10 %arg_f32 : f32
// CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log10 %arg_f64 : f64
// CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log2
- func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log2 %arg_f32 : f32
// CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log2 %arg_f64 : f64
// CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}
// -----
-gpu.module @kernel_module {
+gpu.module @test_module {
// CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_tanh
- func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) {
+ func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.tanh %arg_f32 : f32
// CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.tanh %arg_f64 : f64
// CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
- std.return
+ std.return %result32, %result64 : f32, f64
}
}