[Mlir-commits] [mlir] 45676a8 - [MLIR] Change GpuLaunchFuncToGpuRuntimeCallsPass to wrap a RewritePattern with the same functionality.
Christian Sigg
llvmlistbot at llvm.org
Thu Aug 6 02:55:57 PDT 2020
Author: Christian Sigg
Date: 2020-08-06T11:55:46+02:00
New Revision: 45676a8936124e734177f1360e7af05a7ada7d99
URL: https://github.com/llvm/llvm-project/commit/45676a8936124e734177f1360e7af05a7ada7d99
DIFF: https://github.com/llvm/llvm-project/commit/45676a8936124e734177f1360e7af05a7ada7d99.diff
LOG: [MLIR] Change GpuLaunchFuncToGpuRuntimeCallsPass to wrap a RewritePattern with the same functionality.
The RewritePattern will become one of several, and will be part of the LLVM conversion pass (instead of a separate pass following LLVM conversion).
Reviewed By: herhut
Differential Revision: https://reviews.llvm.org/D84946
Added:
Modified:
mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
mlir/lib/Conversion/GPUCommon/CMakeLists.txt
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 56bc5f2c2c4c..0935786af27a 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -14,10 +14,12 @@
namespace mlir {
+class LLVMTypeConverter;
class Location;
struct LogicalResult;
class ModuleOp;
class Operation;
+class OwningRewritePatternList;
template <typename T>
class OperationPass;
@@ -46,6 +48,11 @@ std::unique_ptr<OperationPass<ModuleOp>>
createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
StringRef gpuBinaryAnnotation = "");
+/// Collect a set of patterns to convert from the GPU dialect to LLVM.
+void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
+ OwningRewritePatternList &patterns,
+ StringRef gpuBinaryAnnotation);
+
/// Creates a pass to convert kernel functions into GPU target object blobs.
///
/// This transformation takes the body of each function that is annotated with
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index a4abae8037f6..9cb6038b3020 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -34,4 +34,5 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
MLIRLLVMIR
MLIRPass
MLIRSupport
+ MLIRStandardToLLVM
)
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
index e186a335214b..51c4cc924fcb 100644
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -16,6 +16,7 @@
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "../PassDetail.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
@@ -34,21 +35,93 @@
using namespace mlir;
-// To avoid name mangling, these are defined in the mini-runtime file.
-static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
-static constexpr const char *kGpuModuleGetFunctionName =
- "mgpuModuleGetFunction";
-static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
-static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate";
-static constexpr const char *kGpuStreamSynchronizeName =
- "mgpuStreamSynchronize";
-static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
namespace {
-/// A pass to convert gpu.launch_func operations into a sequence of GPU
-/// runtime calls. Currently it supports CUDA and ROCm (HIP).
+class GpuLaunchFuncToGpuRuntimeCallsPass
+ : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
+ GpuLaunchFuncToGpuRuntimeCallsPass> {
+public:
+ GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+ if (!gpuBinaryAnnotation.empty())
+ this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
+ }
+
+ // Run the dialect converter on the module.
+ void runOnOperation() override;
+};
+
+class FunctionCallBuilder {
+public:
+ FunctionCallBuilder(StringRef functionName, LLVM::LLVMType returnType,
+ ArrayRef<LLVM::LLVMType> argumentTypes)
+ : functionName(functionName),
+ functionType(LLVM::LLVMType::getFunctionTy(returnType, argumentTypes,
+ /*isVarArg=*/false)) {}
+ LLVM::CallOp create(Location loc, OpBuilder &builder,
+ ArrayRef<Value> arguments) const;
+
+private:
+ StringRef functionName;
+ LLVM::LLVMType functionType;
+};
+
+template <typename OpTy>
+class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
+public:
+ explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}
+
+protected:
+ MLIRContext *context = &this->typeConverter.getContext();
+
+ LLVM::LLVMType llvmVoidType = LLVM::LLVMType::getVoidTy(context);
+ LLVM::LLVMType llvmPointerType = LLVM::LLVMType::getInt8PtrTy(context);
+ LLVM::LLVMType llvmPointerPointerType = llvmPointerType.getPointerTo();
+ LLVM::LLVMType llvmInt8Type = LLVM::LLVMType::getInt8Ty(context);
+ LLVM::LLVMType llvmInt32Type = LLVM::LLVMType::getInt32Ty(context);
+ LLVM::LLVMType llvmInt64Type = LLVM::LLVMType::getInt64Ty(context);
+ LLVM::LLVMType llvmIntPtrType = LLVM::LLVMType::getIntNTy(
+ context, this->typeConverter.getPointerBitwidth(0));
+
+ FunctionCallBuilder moduleLoadCallBuilder = {
+ "mgpuModuleLoad",
+ llvmPointerType /* void *module */,
+ {llvmPointerType /* void *cubin */}};
+ FunctionCallBuilder moduleGetFunctionCallBuilder = {
+ "mgpuModuleGetFunction",
+ llvmPointerType /* void *function */,
+ {
+ llvmPointerType, /* void *module */
+ llvmPointerType /* char *name */
+ }};
+ FunctionCallBuilder launchKernelCallBuilder = {
+ "mgpuLaunchKernel",
+ llvmVoidType,
+ {
+ llvmPointerType, /* void* f */
+ llvmIntPtrType, /* intptr_t gridXDim */
+ llvmIntPtrType, /* intptr_t gridyDim */
+ llvmIntPtrType, /* intptr_t gridZDim */
+ llvmIntPtrType, /* intptr_t blockXDim */
+ llvmIntPtrType, /* intptr_t blockYDim */
+ llvmIntPtrType, /* intptr_t blockZDim */
+ llvmInt32Type, /* unsigned int sharedMemBytes */
+ llvmPointerType, /* void *hstream */
+ llvmPointerPointerType, /* void **kernelParams */
+ llvmPointerPointerType /* void **extra */
+ }};
+ FunctionCallBuilder streamCreateCallBuilder = {
+ "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
+ FunctionCallBuilder streamSynchronizeCallBuilder = {
+ "mgpuStreamSynchronize",
+ llvmVoidType,
+ {llvmPointerType /* void *stream */}};
+};
+
+/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
+/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
@@ -60,167 +133,65 @@ namespace {
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
-class GpuLaunchFuncToGpuRuntimeCallsPass
- : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
- GpuLaunchFuncToGpuRuntimeCallsPass> {
-private:
- LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
-
- void initializeCachedTypes() {
- llvmVoidType = LLVM::LLVMType::getVoidTy(&getContext());
- llvmPointerType = LLVM::LLVMType::getInt8PtrTy(&getContext());
- llvmPointerPointerType = llvmPointerType.getPointerTo();
- llvmInt8Type = LLVM::LLVMType::getInt8Ty(&getContext());
- llvmInt32Type = LLVM::LLVMType::getInt32Ty(&getContext());
- llvmInt64Type = LLVM::LLVMType::getInt64Ty(&getContext());
- llvmIntPtrType = LLVM::LLVMType::getIntNTy(
- &getContext(), llvmDialect->getDataLayout().getPointerSizeInBits());
- }
-
- LLVM::LLVMType getVoidType() { return llvmVoidType; }
-
- LLVM::LLVMType getPointerType() { return llvmPointerType; }
-
- LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }
-
- LLVM::LLVMType getInt8Type() { return llvmInt8Type; }
-
- LLVM::LLVMType getInt32Type() { return llvmInt32Type; }
-
- LLVM::LLVMType getInt64Type() { return llvmInt64Type; }
-
- LLVM::LLVMType getIntPtrType() {
- return LLVM::LLVMType::getIntNTy(
- &getContext(),
- getLLVMDialect()->getDataLayout().getPointerSizeInBits());
- }
-
- // Allocate a void pointer on the stack.
- Value allocatePointer(OpBuilder &builder, Location loc) {
- auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
- builder.getI32IntegerAttr(1));
- return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
- /*alignment=*/0);
- }
+class ConvertLaunchFuncOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
+public:
+ ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
+ StringRef gpuBinaryAnnotation)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
+ gpuBinaryAnnotation(gpuBinaryAnnotation) {}
- void declareGpuRuntimeFunctions(Location loc);
- void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
- unsigned pos, Value one);
- Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
+private:
+ void addParamToArray(OpBuilder &builder, Location loc, Value param,
+ Value array, unsigned pos, Value one) const;
+ Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments,
+ OpBuilder &builder) const;
Value generateKernelNameConstant(StringRef moduleName, StringRef name,
- Location loc, OpBuilder &builder);
- void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
+ Location loc, OpBuilder &builder) const;
-public:
- GpuLaunchFuncToGpuRuntimeCallsPass() = default;
- GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
- this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
- }
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override;
- // Run the dialect converter on the module.
- void runOnOperation() override {
- // Cache the LLVMDialect for the current module.
- llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
- // Cache the used LLVM types.
- initializeCachedTypes();
+ llvm::SmallString<32> gpuBinaryAnnotation;
+};
- getOperation().walk(
- [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
+class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
+ using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
+ PatternRewriter &rewriter) const override {
// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN, or HSACO data.
- for (auto m :
- llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
- m.erase();
+ rewriter.eraseOp(op);
+ return success();
}
-
-private:
- LLVM::LLVMDialect *llvmDialect;
- LLVM::LLVMType llvmVoidType;
- LLVM::LLVMType llvmPointerType;
- LLVM::LLVMType llvmPointerPointerType;
- LLVM::LLVMType llvmInt8Type;
- LLVM::LLVMType llvmInt32Type;
- LLVM::LLVMType llvmInt64Type;
- LLVM::LLVMType llvmIntPtrType;
};
-} // anonymous namespace
-
-// Adds declarations for the needed helper functions from the runtime wrappers.
-// The types in comments give the actual types expected/returned but the API
-// uses void pointers. This is fine as they have the same linkage in C.
-void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
- Location loc) {
- ModuleOp module = getOperation();
- OpBuilder builder(module.getBody()->getTerminator());
- if (!module.lookupSymbol(kGpuModuleLoadName)) {
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuModuleLoadName,
- LLVM::LLVMType::getFunctionTy(getPointerType(),
- {getPointerType()}, /* void *cubin */
- /*isVarArg=*/false));
- }
- if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
- // The helper uses void* instead of CUDA's opaque CUmodule and
- // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuModuleGetFunctionName,
- LLVM::LLVMType::getFunctionTy(getPointerType(),
- {
- getPointerType(), /* void *module */
- getPointerType() /* char *name */
- },
- /*isVarArg=*/false));
- }
- if (!module.lookupSymbol(kGpuLaunchKernelName)) {
- // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to
- // match the LLVM type if MLIR's index type, which the GPU dialect uses.
- // Furthermore, they use void* instead of CUDA's opaque CUfunction and
- // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuLaunchKernelName,
- LLVM::LLVMType::getFunctionTy(
- getVoidType(),
- {
- getPointerType(), /* void* f */
- getIntPtrType(), /* intptr_t gridXDim */
- getIntPtrType(), /* intptr_t gridyDim */
- getIntPtrType(), /* intptr_t gridZDim */
- getIntPtrType(), /* intptr_t blockXDim */
- getIntPtrType(), /* intptr_t blockYDim */
- getIntPtrType(), /* intptr_t blockZDim */
- getInt32Type(), /* unsigned int sharedMemBytes */
- getPointerType(), /* void *hstream */
- getPointerPointerType(), /* void **kernelParams */
- getPointerPointerType() /* void **extra */
- },
- /*isVarArg=*/false));
- }
- if (!module.lookupSymbol(kGpuStreamCreateName)) {
- // Helper function to get the current GPU compute stream. Uses void*
- // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuStreamCreateName,
- LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
- }
- if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuStreamSynchronizeName,
- LLVM::LLVMType::getFunctionTy(getVoidType(),
- {getPointerType()}, /* void *stream */
- /*isVarArg=*/false));
- }
- if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
- builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuMemHostRegisterName,
- LLVM::LLVMType::getFunctionTy(getVoidType(),
- {
- getPointerType(), /* void *ptr */
- getInt64Type() /* int64 sizeBytes*/
- },
- /*isVarArg=*/false));
- }
+} // namespace
+
+void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() {
+ LLVMTypeConverter converter(&getContext());
+ OwningRewritePatternList patterns;
+ populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
+
+ LLVMConversionTarget target(getContext());
+ if (failed(applyPartialConversion(getOperation(), target, patterns)))
+ signalPassFailure();
+}
+
+LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
+ ArrayRef<Value> arguments) const {
+ auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
+ auto function = [&] {
+ if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
+ return function;
+ return OpBuilder(module.getBody()->getTerminator())
+ .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
+ }();
+ return builder.create<LLVM::CallOp>(
+ loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),
+ builder.getSymbolRefAttr(function), arguments);
}
/// Emits the IR with the following structure:
@@ -228,28 +199,26 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
/// %data = llvm.alloca 1 x type-of(<param>)
/// llvm.store <param>, %data
/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
-/// %addr = llvm.getelementptr <list>[<pos>]
+/// %addr = llvm.getelementptr <array>[<pos>]
/// llvm.store %typeErased, %addr
///
-/// This is necessary to construct the list of arguments passed to the kernel
-/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
-/// of stack-allocated type-erased pointers to the actual arguments.
-void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
- Location loc,
- Value param, Value list,
- unsigned pos,
- Value one) {
+/// This is necessary to construct the array of arguments passed to the kernel
+/// function as accepted by cuLaunchKernel, i.e. as a void** that points to
+/// array of stack-allocated type-erased pointers to the actual arguments.
+void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray(
+ OpBuilder &builder, Location loc, Value param, Value array, unsigned pos,
+ Value one) const {
auto memLocation = builder.create<LLVM::AllocaOp>(
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
- /*alignment=*/1);
+ /*alignment=*/0);
builder.create<LLVM::StoreOp>(loc, param, memLocation);
auto casted =
- builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
+ builder.create<LLVM::BitcastOp>(loc, llvmPointerType, memLocation);
- auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
+ auto index = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
builder.getI32IntegerAttr(pos));
- auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,
- ArrayRef<Value>{index});
+ auto gep = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, array,
+ index.getResult());
builder.create<LLVM::StoreOp>(loc, casted, gep);
}
@@ -261,24 +230,16 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
// for (i : [0, NumKernelOperands))
// %array[i] = cast<void*>(KernelOperand[i])
// return %array
-Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
- gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
-
- // Get the launch target.
- auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
- launchOp, launchOp.kernel());
- if (!gpuFunc)
- return {};
-
- unsigned numArgs = gpuFunc.getNumArguments();
-
+Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
+ gpu::LaunchFuncOp launchOp, unsigned numArguments,
+ OpBuilder &builder) const {
auto numKernelOperands = launchOp.getNumKernelOperands();
Location loc = launchOp.getLoc();
- auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
+ auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
builder.getI32IntegerAttr(1));
auto arraySize = builder.create<LLVM::ConstantOp>(
- loc, getInt32Type(), builder.getI32IntegerAttr(numArgs));
- auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),
+ loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
+ auto array = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
arraySize, /*alignment=*/0);
unsigned pos = 0;
@@ -290,7 +251,7 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
// hold anymore then we `launchOp` to lower from MemRefType and not after
// LLVMConversion has taken place and the MemRef information is lost.
if (!llvmType.isStructTy()) {
- addParamToList(builder, loc, operand, array, pos++, one);
+ addParamToArray(builder, loc, operand, array, pos++, one);
continue;
}
@@ -304,7 +265,7 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
Value elem = builder.create<LLVM::ExtractValueOp>(
loc, elemType.getArrayElementType(), operand,
builder.getI32ArrayAttr({j, k}));
- addParamToList(builder, loc, elem, array, pos++, one);
+ addParamToArray(builder, loc, elem, array, pos++, one);
}
} else {
assert((elemType.isIntegerTy() || elemType.isFloatTy() ||
@@ -312,7 +273,7 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
"expected scalar type");
Value strct = builder.create<LLVM::ExtractValueOp>(
loc, elemType, operand, builder.getI32ArrayAttr(j));
- addParamToList(builder, loc, strct, array, pos++, one);
+ addParamToArray(builder, loc, strct, array, pos++, one);
}
}
}
@@ -330,8 +291,9 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
// %1 = llvm.constant (0 : index)
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
-Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
- StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
+Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
+ StringRef moduleName, StringRef name, Location loc,
+ OpBuilder &builder) const {
// Make sure the trailing zero is included in the constant.
std::vector<char> kernelName(name.begin(), name.end());
kernelName.push_back('\0');
@@ -352,93 +314,86 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
-// %5 = <see setupParamsArray>
+// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
-void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
- mlir::gpu::LaunchFuncOp launchOp) {
- OpBuilder builder(launchOp);
- Location loc = launchOp.getLoc();
- declareGpuRuntimeFunctions(loc);
+LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
+ Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const {
+ Location loc = op->getLoc();
+ auto launchOp = cast<gpu::LaunchFuncOp>(op);
+ auto moduleOp = op->getParentOfType<ModuleOp>();
- auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
- builder.getI32IntegerAttr(0));
// Create an LLVM global with CUBIN extracted from the kernel annotation and
// obtain a pointer to the first byte in it.
- auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(
- launchOp.getKernelModuleName());
+ auto kernelModule =
+ moduleOp.lookupSymbol<gpu::GPUModuleOp>(launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
if (!binaryAttr) {
kernelModule.emitOpError()
<< "missing " << gpuBinaryAnnotation << " attribute";
- return signalPassFailure();
+ return failure();
}
SmallString<128> nameBuffer(kernelModule.getName());
nameBuffer.append(kGpuBinaryStorageSuffix);
Value data =
- LLVM::createGlobalString(loc, builder, nameBuffer.str(),
+ LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
binaryAttr.getValue(), LLVM::Linkage::Internal);
- // Emit the load module call to load the module data. Error checking is done
- // in the called helper function.
- auto gpuModuleLoad =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
- auto module = builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getPointerType()},
- builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef<Value>{data});
+ auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
// Get the function from the module. The name corresponds to the name of
// the kernel function.
auto kernelName = generateKernelNameConstant(
- launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
- auto gpuModuleGetFunction =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
- auto function = builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getPointerType()},
- builder.getSymbolRefAttr(gpuModuleGetFunction),
- ArrayRef<Value>{module.getResult(0), kernelName});
+ launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
+ auto function = moduleGetFunctionCallBuilder.create(
+ loc, rewriter, {module.getResult(0), kernelName});
// Grab the global stream needed for execution.
- auto gpuStreamCreate =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamCreateName);
- auto stream = builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getPointerType()},
- builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef<Value>{});
- // Invoke the function with required arguments.
- auto gpuLaunchKernel =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
- auto paramsArray = setupParamsArray(launchOp, builder);
- if (!paramsArray) {
- launchOp.emitOpError() << "cannot pass given parameters to the kernel";
- return signalPassFailure();
+ auto stream = streamCreateCallBuilder.create(loc, rewriter, {});
+
+ // Get the launch target.
+ auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
+ launchOp, launchOp.kernel());
+ if (!gpuFuncOp) {
+ launchOp.emitOpError() << "corresponding kernel function not found";
+ return failure();
}
+ // Build array of kernel parameters.
+ auto kernelParams =
+ generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter);
+
+ // Invoke the function with required arguments.
+ auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ rewriter.getI32IntegerAttr(0));
auto nullpointer =
- builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
- builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getVoidType()},
- builder.getSymbolRefAttr(gpuLaunchKernel),
- ArrayRef<Value>{function.getResult(0), launchOp.getOperand(0),
- launchOp.getOperand(1), launchOp.getOperand(2),
- launchOp.getOperand(3), launchOp.getOperand(4),
- launchOp.getOperand(5), zero, /* sharedMemBytes */
- stream.getResult(0), /* stream */
- paramsArray, /* kernel params */
- nullpointer /* extra */});
- // Sync on the stream to make it synchronous.
- auto gpuStreamSync =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
- builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getVoidType()},
- builder.getSymbolRefAttr(gpuStreamSync),
- ArrayRef<Value>(stream.getResult(0)));
- launchOp.erase();
+ rewriter.create<LLVM::IntToPtrOp>(loc, llvmPointerPointerType, zero);
+ launchKernelCallBuilder.create(
+ loc, rewriter,
+ {function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(),
+ launchOp.gridSizeZ(), launchOp.blockSizeX(), launchOp.blockSizeY(),
+ launchOp.blockSizeZ(), zero, /* sharedMemBytes */
+ stream.getResult(0), /* stream */
+ kernelParams, /* kernel params */
+ nullpointer /* extra */});
+ streamSynchronizeCallBuilder.create(loc, rewriter, stream.getResult(0));
+
+ rewriter.eraseOp(op);
+ return success();
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
StringRef gpuBinaryAnnotation) {
- if (gpuBinaryAnnotation.empty())
- return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
gpuBinaryAnnotation);
}
+
+void mlir::populateGpuToLLVMConversionPatterns(
+ LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
+ StringRef gpuBinaryAnnotation) {
+ patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
+ converter, gpuBinaryAnnotation);
+ patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
+}
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
index cdd8ec3fe5f3..62e7dbe34c00 100644
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -110,15 +110,17 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
PassManager pm(m.getContext());
applyPassManagerCLOptions(pm);
+ const char gpuBinaryAnnotation[] = "nvvm.cubin";
pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(
translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
- "sm_35", "+ptx60", "nvvm.cubin"));
+ "sm_35", "+ptx60", gpuBinaryAnnotation));
pm.addPass(createLowerToLLVMPass());
- pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());
+ pm.addPass(
+ createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
return pm.run(m);
}
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
index d61cb2d98809..5b3d23adcb89 100644
--- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
+++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
@@ -299,16 +299,17 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
// Configure target features per ROCm / HIP version.
configTargetFeatures();
+ const char gpuBinaryAnnotation[] = "rocdl.hsaco";
pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(
compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
- features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+ features, gpuBinaryAnnotation));
pm.addPass(createLowerToLLVMPass());
- pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
- /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+ pm.addPass(
+ createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
return pm.run(m);
}
More information about the Mlir-commits
mailing list