[Mlir-commits] [mlir] 0d4b7ad - [MLIR] Make gpu.launch_func rewrite pattern part of the LLVM lowering pass.
Christian Sigg
llvmlistbot at llvm.org
Mon Aug 10 10:28:42 PDT 2020
Author: Christian Sigg
Date: 2020-08-10T19:28:30+02:00
New Revision: 0d4b7adb8220f68f5eca08bb53d8af0b0d139c5d
URL: https://github.com/llvm/llvm-project/commit/0d4b7adb8220f68f5eca08bb53d8af0b0d139c5d
DIFF: https://github.com/llvm/llvm-project/commit/0d4b7adb8220f68f5eca08bb53d8af0b0d139c5d.diff
LOG: [MLIR] Make gpu.launch_func rewrite pattern part of the LLVM lowering pass.
Reviewed By: herhut
Differential Revision: https://reviews.llvm.org/D85073
Added:
Modified:
mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
mlir/include/mlir/Conversion/Passes.td
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 5cb987081bb5..61d62a3ef78c 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -45,8 +45,7 @@ using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
/// instead uses a small wrapper library that exports a stable and conveniently
/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
std::unique_ptr<OperationPass<ModuleOp>>
-createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
- StringRef gpuBinaryAnnotation = "");
+createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation = "");
/// Collect a set of patterns to convert from the GPU dialect to LLVM.
void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
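
For downstream pipelines, the renamed entry point is used the same way the runners below use it. A minimal sketch, assuming a module-level PassManager; the annotation value is illustrative, and an empty string keeps the pass default:

    #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
    #include "mlir/Pass/PassManager.h"

    // Sketch only: one pass now lowers both std and gpu.launch_func, so the
    // separate createLowerToLLVMPass step used by the runners goes away.
    static void buildHostLoweringPipeline(mlir::PassManager &pm) {
      pm.addPass(mlir::createGpuToLLVMConversionPass(
          /*gpuBinaryAnnotation=*/"nvvm.cubin"));
    }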
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index f4c790655b1f..4d4fe064a6bc 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -82,14 +82,12 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
// GPUCommon
//===----------------------------------------------------------------------===//
-def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime",
- "ModuleOp"> {
- let summary = "Convert all launch_func ops to GPU runtime calls";
- let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()";
+def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
+ let summary = "Convert GPU dialect to LLVM dialect with GPU runtime calls";
+ let constructor = "mlir::createGpuToLLVMConversionPass()";
let options = [
Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
- "\"nvvm.cubin\"",
- "Annotation attribute string for GPU binary">,
+ "", "Annotation attribute string for GPU binary">,
];
}
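
With the tablegen default now empty, the option can still be set explicitly through the textual pipeline syntax, matching the updated mlir-opt RUN lines in the test further down. A hedged sketch of doing that from C++, assuming the conversion passes have been registered and using the standard parsePassPipeline helper:

    #include "mlir/Pass/PassManager.h"
    #include "mlir/Pass/PassRegistry.h"

    // Sketch only: the string form mirrors
    //   --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco"
    // on the mlir-opt command line; parsePassPipeline appends the configured
    // pass to the given pass manager.
    static mlir::LogicalResult addGpuLowering(mlir::OpPassManager &pm) {
      return mlir::parsePassPipeline(
          "gpu-to-llvm{gpu-binary-annotation=rocdl.hsaco}", pm);
    }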
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
index 51c4cc924fcb..e14c9fbc6718 100644
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -39,11 +39,10 @@ static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
namespace {
-class GpuLaunchFuncToGpuRuntimeCallsPass
- : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
- GpuLaunchFuncToGpuRuntimeCallsPass> {
+class GpuToLLVMConversionPass
+ : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
- GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+ GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
if (!gpuBinaryAnnotation.empty())
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
}
@@ -142,10 +141,8 @@ class ConvertLaunchFuncOpToGpuRuntimeCallPattern
gpuBinaryAnnotation(gpuBinaryAnnotation) {}
private:
- void addParamToArray(OpBuilder &builder, Location loc, Value param,
- Value array, unsigned pos, Value one) const;
- Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments,
- OpBuilder &builder) const;
+ Value generateParamsArray(gpu::LaunchFuncOp launchOp,
+ ArrayRef<Value> operands, OpBuilder &builder) const;
Value generateKernelNameConstant(StringRef moduleName, StringRef name,
Location loc, OpBuilder &builder) const;
@@ -170,9 +167,10 @@ class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
} // namespace
-void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() {
+void GpuToLLVMConversionPass::runOnOperation() {
LLVMTypeConverter converter(&getContext());
OwningRewritePatternList patterns;
+ populateStdToLLVMConversionPatterns(converter, patterns);
populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
LLVMConversionTarget target(getContext());
@@ -194,91 +192,57 @@ LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
builder.getSymbolRefAttr(function), arguments);
}
-/// Emits the IR with the following structure:
-///
-/// %data = llvm.alloca 1 x type-of(<param>)
-/// llvm.store <param>, %data
-/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
-/// %addr = llvm.getelementptr <array>[<pos>]
-/// llvm.store %typeErased, %addr
-///
-/// This is necessary to construct the array of arguments passed to the kernel
-/// function as accepted by cuLaunchKernel, i.e. as a void** that points to
-/// array of stack-allocated type-erased pointers to the actual arguments.
-void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray(
- OpBuilder &builder, Location loc, Value param, Value array, unsigned pos,
- Value one) const {
- auto memLocation = builder.create<LLVM::AllocaOp>(
- loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
- /*alignment=*/0);
- builder.create<LLVM::StoreOp>(loc, param, memLocation);
- auto casted =
- builder.create<LLVM::BitcastOp>(loc, llvmPointerType, memLocation);
-
- auto index = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
- builder.getI32IntegerAttr(pos));
- auto gep = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, array,
- index.getResult());
- builder.create<LLVM::StoreOp>(loc, casted, gep);
-}
-
-// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
-// launch call. The arguments are extracted from the launchOp.
+// Creates a struct containing all kernel parameters on the stack and returns
+// an array of type-erased pointers to the fields of the struct. The array can
+// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
-// %array = alloca(numparams * sizeof(void *))
-// for (i : [0, NumKernelOperands))
-// %array[i] = cast<void*>(KernelOperand[i])
+// %struct = alloca(sizeof(struct { Parameters... }))
+// %array = alloca(NumParameters * sizeof(void *))
+// for (i : [0, NumParameters))
+// %fieldPtr = llvm.getelementptr %struct[0, i]
+// llvm.store parameters[i], %fieldPtr
+// %elementPtr = llvm.getelementptr %array[i]
+// llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
- gpu::LaunchFuncOp launchOp, unsigned numArguments,
+ gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
OpBuilder &builder) const {
+ auto loc = launchOp.getLoc();
auto numKernelOperands = launchOp.getNumKernelOperands();
- Location loc = launchOp.getLoc();
+ auto arguments = typeConverter.promoteOperands(
+ loc, launchOp.getOperands().take_back(numKernelOperands),
+ operands.take_back(numKernelOperands), builder);
+ auto numArguments = arguments.size();
+ SmallVector<LLVM::LLVMType, 4> argumentTypes;
+ argumentTypes.reserve(numArguments);
+ for (auto argument : arguments)
+ argumentTypes.push_back(argument.getType().cast<LLVM::LLVMType>());
+ auto structType = LLVM::LLVMType::createStructTy(argumentTypes, StringRef());
auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
builder.getI32IntegerAttr(1));
+ auto structPtr = builder.create<LLVM::AllocaOp>(
+ loc, structType.getPointerTo(), one, /*alignment=*/0);
auto arraySize = builder.create<LLVM::ConstantOp>(
loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
- auto array = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
- arraySize, /*alignment=*/0);
-
- unsigned pos = 0;
- for (unsigned idx = 0; idx < numKernelOperands; ++idx) {
- auto operand = launchOp.getKernelOperand(idx);
- auto llvmType = operand.getType().cast<LLVM::LLVMType>();
-
- // Assume all struct arguments come from MemRef. If this assumption does not
- // hold anymore then we `launchOp` to lower from MemRefType and not after
- // LLVMConversion has taken place and the MemRef information is lost.
- if (!llvmType.isStructTy()) {
- addParamToArray(builder, loc, operand, array, pos++, one);
- continue;
- }
-
- // Put individual components of a memref descriptor into the flat argument
- // list. We cannot use unpackMemref from LLVM lowering here because we have
- // no access to MemRefType that had been lowered away.
- for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) {
- auto elemType = llvmType.getStructElementType(j);
- if (elemType.isArrayTy()) {
- for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) {
- Value elem = builder.create<LLVM::ExtractValueOp>(
- loc, elemType.getArrayElementType(), operand,
- builder.getI32ArrayAttr({j, k}));
- addParamToArray(builder, loc, elem, array, pos++, one);
- }
- } else {
- assert((elemType.isIntegerTy() || elemType.isFloatTy() ||
- elemType.isDoubleTy() || elemType.isPointerTy()) &&
- "expected scalar type");
- Value strct = builder.create<LLVM::ExtractValueOp>(
- loc, elemType, operand, builder.getI32ArrayAttr(j));
- addParamToArray(builder, loc, strct, array, pos++, one);
- }
- }
+ auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
+ arraySize, /*alignment=*/0);
+ auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ builder.getI32IntegerAttr(0));
+ for (auto en : llvm::enumerate(arguments)) {
+ auto index = builder.create<LLVM::ConstantOp>(
+ loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
+ auto fieldPtr =
+ builder.create<LLVM::GEPOp>(loc, structType.getPointerTo(), structPtr,
+ ArrayRef<Value>{zero, index.getResult()});
+ builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
+ auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
+ arrayPtr, index.getResult());
+ auto casted =
+ builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
+ builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
}
-
- return array;
+ return arrayPtr;
}
// Generates an LLVM IR dialect global that contains the name of the given
@@ -305,6 +269,11 @@ Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
LLVM::Linkage::Internal);
}
+// Returns whether value is of LLVM type.
+static bool isLLVMType(Value value) {
+ return value.getType().isa<LLVM::LLVMType>();
+}
+
// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
@@ -320,14 +289,17 @@ Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
- Location loc = op->getLoc();
+ if (!llvm::all_of(operands, isLLVMType))
+ return rewriter.notifyMatchFailure(
+ op, "Cannot convert if operands aren't of LLVM type.");
+
auto launchOp = cast<gpu::LaunchFuncOp>(op);
- auto moduleOp = op->getParentOfType<ModuleOp>();
+ Location loc = launchOp.getLoc();
// Create an LLVM global with CUBIN extracted from the kernel annotation and
// obtain a pointer to the first byte in it.
- auto kernelModule =
- moduleOp.lookupSymbol<gpu::GPUModuleOp>(launchOp.getKernelModuleName());
+ auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
+ launchOp, launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
@@ -350,25 +322,13 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
auto function = moduleGetFunctionCallBuilder.create(
loc, rewriter, {module.getResult(0), kernelName});
- // Grab the global stream needed for execution.
- auto stream = streamCreateCallBuilder.create(loc, rewriter, {});
-
- // Get the launch target.
- auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
- launchOp, launchOp.kernel());
- if (!gpuFuncOp) {
- launchOp.emitOpError() << "corresponding kernel function not found";
- return failure();
- }
- // Build array of kernel parameters.
- auto kernelParams =
- generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter);
-
- // Invoke the function with required arguments.
auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
rewriter.getI32IntegerAttr(0));
- auto nullpointer =
- rewriter.create<LLVM::IntToPtrOp>(loc, llvmPointerPointerType, zero);
+ // Grab the global stream needed for execution.
+ auto stream = streamCreateCallBuilder.create(loc, rewriter, {});
+ // Create array of pointers to kernel arguments.
+ auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
+ auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
launchKernelCallBuilder.create(
loc, rewriter,
{function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(),
@@ -384,10 +344,8 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
-mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
- StringRef gpuBinaryAnnotation) {
- return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
- gpuBinaryAnnotation);
+mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
+ return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
}
void mlir::populateGpuToLLVMConversionPatterns(
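
To make the new parameter packing concrete: for a kernel taking, say, (float*, i32), the rewritten generateParamsArray allocates one struct holding all promoted arguments plus an array of type-erased pointers into that struct, and the array is what reaches the launch call as void**. A hedged C++ analogy of the emitted llvm.alloca / llvm.getelementptr / llvm.store sequence; the argument types and names are illustrative only:

    #include <cstdint>

    // Hedged analogy of the IR emitted for a kernel taking (float*, i32).
    void launchWithPackedParams(float *arg0, std::int32_t arg1,
                                void (*launch)(void **kernelParams)) {
      struct Params {                 // %struct = llvm.alloca for { float*, i32 }
        float *field0;
        std::int32_t field1;
      } params{arg0, arg1};           // llvm.store of each promoted argument
      void *array[2];                 // %array = llvm.alloca NumParameters x i8*
      array[0] = &params.field0;      // gep %struct[0, 0], bitcast, store %array[0]
      array[1] = &params.field1;      // gep %struct[0, 1], bitcast, store %array[1]
      launch(array);                  // forwarded as void** to the launch call
    }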
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index f5c085224f07..2391b185bda8 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL
module attributes {gpu.container_module} {
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
index 62e7dbe34c00..208e27a75f78 100644
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -118,9 +118,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
kernelPm.addPass(createConvertGPUKernelToBlobPass(
translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
"sm_35", "+ptx60", gpuBinaryAnnotation));
- pm.addPass(createLowerToLLVMPass());
- pm.addPass(
- createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
+ pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
return pm.run(m);
}
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
index de07a2370857..4689926be87d 100644
--- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
+++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
@@ -309,9 +309,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
kernelPm.addPass(createConvertGPUKernelToBlobPass(
compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
features, gpuBinaryAnnotation));
- pm.addPass(createLowerToLLVMPass());
- pm.addPass(
- createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
+ pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
return pm.run(m);
}
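
Taken together, the rewritten ConvertLaunchFuncOpToGpuRuntimeCallPattern replaces each gpu.launch_func with a sequence of calls into the GPU runtime wrapper ABI. A hedged C++ analogy of that sequence is below; the GpuRuntime struct, its member names, the constant grid/block sizes, and the trailing synchronize call are illustrative stand-ins for what the pass reaches through its call builders (moduleGetFunctionCallBuilder, streamCreateCallBuilder, launchKernelCallBuilder), not the wrapper library's actual symbols:

    #include <cstdint>

    // Illustrative stand-in for the wrapper ABI the generated calls target.
    struct GpuRuntime {
      void *(*moduleLoad)(const void *binaryBlob);
      void *(*moduleGetFunction)(void *module, const char *kernelName);
      void *(*streamCreate)();
      void (*launchKernel)(void *function, std::intptr_t gridX,
                           std::intptr_t gridY, std::intptr_t gridZ,
                           std::intptr_t blockX, std::intptr_t blockY,
                           std::intptr_t blockZ, std::int32_t sharedMemBytes,
                           void *stream, void **kernelParams, void **extra);
      void (*streamSynchronize)(void *stream); // assumed follow-up step
    };

    // Sketch of the emitted call sequence; in the real IR the grid and block
    // sizes come from the gpu.launch_func operands, and kernelParams is the
    // array built by generateParamsArray above.
    void emittedLaunchSequence(const GpuRuntime &rt, const void *gpuBinary,
                               const char *kernelName, void **kernelParams) {
      void *module = rt.moduleLoad(gpuBinary);
      void *function = rt.moduleGetFunction(module, kernelName);
      void *stream = rt.streamCreate();
      rt.launchKernel(function, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1,
                      /*blockX=*/1, /*blockY=*/1, /*blockZ=*/1,
                      /*sharedMemBytes=*/0, stream, kernelParams,
                      /*extra=*/nullptr);
      rt.streamSynchronize(stream);
    }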