[Mlir-commits] [mlir] cdb6f05 - [mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass.
Wen-Heng Chung
llvmlistbot at llvm.org
Wed May 20 14:12:03 PDT 2020
Author: Wen-Heng (Jack) Chung
Date: 2020-05-20T16:11:48-05:00
New Revision: cdb6f05e2d5f0132956020e6b4990af0206c066f
URL: https://github.com/llvm/llvm-project/commit/cdb6f05e2d5f0132956020e6b4990af0206c066f
DIFF: https://github.com/llvm/llvm-project/commit/cdb6f05e2d5f0132956020e6b4990af0206c066f.diff
LOG: [mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass.
Due to similar APIs between CUDA and ROCm (HIP),
ConvertGpuLaunchFuncToCudaCalls pass could be used on both platforms with some
refactoring.
In this commit:
- Migrate ConvertLaunchFuncToCudaCalls from GPUToCUDA to GPUCommon, and rename.
- Rename runtime wrapper APIs to be platform-neutral.
- Let GPU binary annotation attribute be specifiable as a PassOption.
- Naming changes within the implementation and tests.
Subsequent patches will introduce ROCm-specific tests and runtime wrapper
APIs.
Differential Revision: https://reviews.llvm.org/D80167
Added:
mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
mlir/lib/Conversion/GPUCommon/CMakeLists.txt
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
Modified:
mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
mlir/include/mlir/Conversion/Passes.td
mlir/include/mlir/InitAllPasses.h
mlir/lib/Conversion/CMakeLists.txt
mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
Removed:
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
new file mode 100644
index 000000000000..791d859f6414
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -0,0 +1,36 @@
+//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
+#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
+
+#include "mlir/Support/LLVM.h"
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mlir {
+
+class Location;
+class ModuleOp;
+
+template <typename T>
+class OperationPass;
+
+/// Creates a pass to convert a gpu.launch_func operation into a sequence of
+/// GPU runtime calls.
+///
+/// This pass does not generate code to call GPU runtime APIs directly but
+/// instead uses a small wrapper library that exports a stable and conveniently
+/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
+std::unique_ptr<OperationPass<ModuleOp>>
+createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
index 6e21483c6728..bac13d6d7ccb 100644
--- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
+++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -45,15 +45,6 @@ using CubinGenerator =
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
-/// Creates a pass to convert a gpu.launch_func operation into a sequence of
-/// CUDA calls.
-///
-/// This pass does not generate code to call CUDA directly but instead uses a
-/// small wrapper library that exports a stable and conveniently typed ABI
-/// on top of CUDA.
-std::unique_ptr<OperationPass<ModuleOp>>
-createConvertGpuLaunchFuncToCudaCallsPass();
-
} // namespace mlir
#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ea4ea845a5a4..65d05a7aea53 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -79,12 +79,18 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
}
//===----------------------------------------------------------------------===//
-// GPUToCUDA
+// GPUCommon
//===----------------------------------------------------------------------===//
-def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda", "ModuleOp"> {
- let summary = "Convert all launch_func ops to CUDA runtime calls";
- let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()";
+def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime",
+ "ModuleOp"> {
+ let summary = "Convert all launch_func ops to GPU runtime calls";
+ let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()";
+ let options = [
+ Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
+ "\"nvvm.cubin\"",
+ "Annotation attribute string for GPU binary">,
+ ];
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 5b5f72f9d82b..66083f671cde 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -15,6 +15,7 @@
#define MLIR_INITALLPASSES_H_
#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index d78fb8920516..248f5f5a0e6c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -1,5 +1,6 @@
add_subdirectory(AffineToStandard)
add_subdirectory(AVX512ToLLVM)
+add_subdirectory(GPUCommon)
add_subdirectory(GPUToCUDA)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
new file mode 100644
index 000000000000..a01fb7676b10
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(SOURCES
+ ConvertLaunchFuncToRuntimeCalls.cpp
+)
+
+add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
+ ${SOURCES}
+
+ DEPENDS
+ MLIRConversionPassIncGen
+ intrinsics_gen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRGPU
+ MLIRIR
+ MLIRLLVMIR
+ MLIRPass
+ MLIRSupport
+)
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
similarity index 68%
rename from mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
rename to mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
index cfdcb0f98ade..7bd388803d96 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -1,4 +1,4 @@
-//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===//
+//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func op into a sequence of
-// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI,
-// this pass uses a slim runtime layer that builds on top of the public API from
-// the CUDA headers.
+// GPU runtime calls. As most GPU runtimes do not have a stable published
+// ABI, this pass uses a slim runtime layer that builds on top of the public
+// API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "../PassDetail.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
@@ -35,33 +35,34 @@
using namespace mlir;
// To avoid name mangling, these are defined in the mini-runtime file.
-static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
-static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
-static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
-static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
-static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
-static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
-
-static constexpr const char *kCubinAnnotation = "nvvm.cubin";
-static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
+static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
+static constexpr const char *kGpuModuleGetFunctionName =
+ "mgpuModuleGetFunction";
+static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
+static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper";
+static constexpr const char *kGpuStreamSynchronizeName =
+ "mgpuStreamSynchronize";
+static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
+static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
namespace {
-/// A pass to convert gpu.launch_func operations into a sequence of CUDA
-/// runtime calls.
+/// A pass to convert gpu.launch_func operations into a sequence of GPU
+/// runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operations gets compiled into the following
/// sequence of runtime calls:
///
-/// * mcuModuleLoad -- loads the module given the cubin data
-/// * mcuModuleGetFunction -- gets a handle to the actual kernel function
-/// * mcuGetStreamHelper -- initializes a new CUDA stream
-/// * mcuLaunchKernelName -- launches the kernel on a stream
-/// * mcuStreamSynchronize -- waits for operations on the stream to finish
+/// * moduleLoad -- loads the module given the cubin / hsaco data
+/// * moduleGetFunction -- gets a handle to the actual kernel function
+/// * getStreamHelper -- initializes a new compute stream on GPU
+/// * launchKernel -- launches the kernel on a stream
+/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
-class GpuLaunchFuncToCudaCallsPass
- : public ConvertGpuLaunchFuncToCudaCallsBase<GpuLaunchFuncToCudaCallsPass> {
+class GpuLaunchFuncToGpuRuntimeCallsPass
+ : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
+ GpuLaunchFuncToGpuRuntimeCallsPass> {
private:
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
@@ -99,8 +100,9 @@ class GpuLaunchFuncToCudaCallsPass
getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
}
- LLVM::LLVMType getCUResultType() {
- // This is declared as an enum in CUDA but helpers use i32.
+ LLVM::LLVMType getGpuRuntimeResultType() {
+ // This is declared as an enum in both CUDA and ROCm (HIP), but helpers
+ // use i32.
return getInt32Type();
}
@@ -112,7 +114,7 @@ class GpuLaunchFuncToCudaCallsPass
/*alignment=*/0);
}
- void declareCudaFunctions(Location loc);
+ void declareGpuRuntimeFunctions(Location loc);
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
unsigned pos, Value one);
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
@@ -132,7 +134,7 @@ class GpuLaunchFuncToCudaCallsPass
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
// GPU kernel modules are no longer necessary since we have a global
- // constant with the CUBIN data.
+ // constant with the CUBIN or HSACO data.
for (auto m :
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
m.erase();
@@ -151,30 +153,31 @@ class GpuLaunchFuncToCudaCallsPass
} // anonymous namespace
-// Adds declarations for the needed helper functions from the CUDA wrapper.
+// Adds declarations for the needed helper functions from the runtime wrappers.
// The types in comments give the actual types expected/returned but the API
// uses void pointers. This is fine as they have the same linkage in C.
-void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
+void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
+ Location loc) {
ModuleOp module = getOperation();
OpBuilder builder(module.getBody()->getTerminator());
- if (!module.lookupSymbol(cuModuleLoadName)) {
+ if (!module.lookupSymbol(kGpuModuleLoadName)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, cuModuleLoadName,
+ loc, kGpuModuleLoadName,
LLVM::LLVMType::getFunctionTy(
- getCUResultType(),
+ getGpuRuntimeResultType(),
{
getPointerPointerType(), /* CUmodule *module */
getPointerType() /* void *cubin */
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(cuModuleGetFunctionName)) {
+ if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
// The helper uses void* instead of CUDA's opaque CUmodule and
- // CUfunction.
+ // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
builder.create<LLVM::LLVMFuncOp>(
- loc, cuModuleGetFunctionName,
+ loc, kGpuModuleGetFunctionName,
LLVM::LLVMType::getFunctionTy(
- getCUResultType(),
+ getGpuRuntimeResultType(),
{
getPointerPointerType(), /* void **function */
getPointerType(), /* void *module */
@@ -182,15 +185,15 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(cuLaunchKernelName)) {
- // Other than the CUDA api, the wrappers use uintptr_t to match the
- // LLVM type if MLIR's index type, which the GPU dialect uses.
+ if (!module.lookupSymbol(kGpuLaunchKernelName)) {
+ // Unlike the CUDA or ROCm (HIP) API, the wrappers use intptr_t to
+ // match the LLVM type of MLIR's index type, which the GPU dialect uses.
// Furthermore, they use void* instead of CUDA's opaque CUfunction and
- // CUstream.
+ // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
builder.create<LLVM::LLVMFuncOp>(
- loc, cuLaunchKernelName,
+ loc, kGpuLaunchKernelName,
LLVM::LLVMType::getFunctionTy(
- getCUResultType(),
+ getGpuRuntimeResultType(),
{
getPointerType(), /* void* f */
getIntPtrType(), /* intptr_t gridXDim */
@@ -206,23 +209,23 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(cuGetStreamHelperName)) {
- // Helper function to get the current CUDA stream. Uses void* instead of
- // CUDAs opaque CUstream.
+ if (!module.lookupSymbol(kGpuGetStreamHelperName)) {
+ // Helper function to get the current GPU compute stream. Uses void*
+ // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
builder.create<LLVM::LLVMFuncOp>(
- loc, cuGetStreamHelperName,
+ loc, kGpuGetStreamHelperName,
LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
}
- if (!module.lookupSymbol(cuStreamSynchronizeName)) {
+ if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, cuStreamSynchronizeName,
- LLVM::LLVMType::getFunctionTy(getCUResultType(),
+ loc, kGpuStreamSynchronizeName,
+ LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(),
getPointerType() /* CUstream stream */,
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kMcuMemHostRegister)) {
+ if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, kMcuMemHostRegister,
+ loc, kGpuMemHostRegisterName,
LLVM::LLVMType::getFunctionTy(getVoidType(),
{
getPointerType(), /* void *ptr */
@@ -243,10 +246,11 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
/// This is necessary to construct the list of arguments passed to the kernel
/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
/// of stack-allocated type-erased pointers to the actual arguments.
-void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder,
- Location loc, Value param,
- Value list, unsigned pos,
- Value one) {
+void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
+ Location loc,
+ Value param, Value list,
+ unsigned pos,
+ Value one) {
auto memLocation = builder.create<LLVM::AllocaOp>(
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
/*alignment=*/1);
@@ -261,16 +265,16 @@ void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder,
builder.create<LLVM::StoreOp>(loc, casted, gep);
}
-// Generates a parameters array to be used with a CUDA kernel launch call. The
-// arguments are extracted from the launchOp.
+// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
+// launch call. The arguments are extracted from the launchOp.
// The generated code is essentially as follows:
//
// %array = alloca(numparams * sizeof(void *))
// for (i : [0, NumKernelOperands))
// %array[i] = cast<void*>(KernelOperand[i])
// return %array
-Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
- OpBuilder &builder) {
+Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
+ gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
// Get the launch target.
auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
@@ -338,7 +342,7 @@ Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
// %1 = llvm.constant (0 : index)
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
-Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
+Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
// Make sure the trailing zero is included in the constant.
std::vector<char> kernelName(name.begin(), name.end());
@@ -352,30 +356,26 @@ Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
}
// Emits LLVM IR to launch a kernel function. Expects the module that contains
-// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
-// kernel function in the IR.
-// While MLIR has no global constants, also expects a cubin getter function in
-// an 'nvvm.cubingetter' attribute. Such function is expected to return a
-// pointer to the cubin blob when invoked.
-// With these given, the generated code in essence is
+// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
+// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
//
-// %0 = call %cubingetter
+// %0 = call %binarygetter
// %1 = alloca sizeof(void*)
-// call %mcuModuleLoad(%2, %1)
+// call %moduleLoad(%1, %0)
// %2 = alloca sizeof(void*)
// %3 = load %1
// %4 = <see generateKernelNameConstant>
-// call %mcuModuleGetFunction(%2, %3, %4)
-// %5 = call %mcuGetStreamHelper()
+// call %moduleGetFunction(%2, %3, %4)
+// %5 = call %getStreamHelper()
// %6 = load %2
// %7 = <see setupParamsArray>
-// call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
-// call %mcuStreamSynchronize(%5)
-void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
+// call %launchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
+// call %streamSynchronize(%5)
+void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
mlir::gpu::LaunchFuncOp launchOp) {
OpBuilder builder(launchOp);
Location loc = launchOp.getLoc();
- declareCudaFunctions(loc);
+ declareGpuRuntimeFunctions(loc);
auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(0));
@@ -385,51 +385,51 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
- auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
- if (!cubinAttr) {
+ auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
+ if (!binaryAttr) {
kernelModule.emitOpError()
- << "missing " << kCubinAnnotation << " attribute";
+ << "missing " << gpuBinaryAnnotation << " attribute";
return signalPassFailure();
}
SmallString<128> nameBuffer(kernelModule.getName());
- nameBuffer.append(kCubinStorageSuffix);
+ nameBuffer.append(kGpuBinaryStorageSuffix);
Value data = LLVM::createGlobalString(
- loc, builder, nameBuffer.str(), cubinAttr.getValue(),
+ loc, builder, nameBuffer.str(), binaryAttr.getValue(),
LLVM::Linkage::Internal, getLLVMDialect());
// Emit the load module call to load the module data. Error checking is done
// in the called helper function.
- auto cuModule = allocatePointer(builder, loc);
- auto cuModuleLoad =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName);
- builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
- builder.getSymbolRefAttr(cuModuleLoad),
- ArrayRef<Value>{cuModule, data});
+ auto gpuModule = allocatePointer(builder, loc);
+ auto gpuModuleLoad =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
+ builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
+ builder.getSymbolRefAttr(gpuModuleLoad),
+ ArrayRef<Value>{gpuModule, data});
// Get the function from the module. The name corresponds to the name of
// the kernel function.
- auto cuOwningModuleRef =
- builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
+ auto gpuOwningModuleRef =
+ builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuModule);
auto kernelName = generateKernelNameConstant(
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
- auto cuFunction = allocatePointer(builder, loc);
- auto cuModuleGetFunction =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);
+ auto gpuFunction = allocatePointer(builder, loc);
+ auto gpuModuleGetFunction =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getCUResultType()},
- builder.getSymbolRefAttr(cuModuleGetFunction),
- ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});
+ loc, ArrayRef<Type>{getGpuRuntimeResultType()},
+ builder.getSymbolRefAttr(gpuModuleGetFunction),
+ ArrayRef<Value>{gpuFunction, gpuOwningModuleRef, kernelName});
// Grab the global stream needed for execution.
- auto cuGetStreamHelper =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName);
- auto cuStream = builder.create<LLVM::CallOp>(
+ auto gpuGetStreamHelper =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuGetStreamHelperName);
+ auto gpuStream = builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getPointerType()},
- builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{});
+ builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef<Value>{});
// Invoke the function with required arguments.
- auto cuLaunchKernel =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName);
- auto cuFunctionRef =
- builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);
+ auto gpuLaunchKernel =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
+ auto gpuFunctionRef =
+ builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuFunction);
auto paramsArray = setupParamsArray(launchOp, builder);
if (!paramsArray) {
launchOp.emitOpError() << "cannot pass given parameters to the kernel";
@@ -438,25 +438,25 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
auto nullpointer =
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getCUResultType()},
- builder.getSymbolRefAttr(cuLaunchKernel),
- ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0),
+ loc, ArrayRef<Type>{getGpuRuntimeResultType()},
+ builder.getSymbolRefAttr(gpuLaunchKernel),
+ ArrayRef<Value>{gpuFunctionRef, launchOp.getOperand(0),
launchOp.getOperand(1), launchOp.getOperand(2),
launchOp.getOperand(3), launchOp.getOperand(4),
launchOp.getOperand(5), zero, /* sharedMemBytes */
- cuStream.getResult(0), /* stream */
+ gpuStream.getResult(0), /* stream */
paramsArray, /* kernel params */
nullpointer /* extra */});
// Sync on the stream to make it synchronous.
- auto cuStreamSync =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName);
- builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
- builder.getSymbolRefAttr(cuStreamSync),
- ArrayRef<Value>(cuStream.getResult(0)));
+ auto gpuStreamSync =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
+ builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
+ builder.getSymbolRefAttr(gpuStreamSync),
+ ArrayRef<Value>(gpuStream.getResult(0)));
launchOp.erase();
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
-mlir::createConvertGpuLaunchFuncToCudaCallsPass() {
- return std::make_unique<GpuLaunchFuncToCudaCallsPass>();
+mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
+ return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
}
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
index 4696dd65fa62..31ca6055c395 100644
--- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
@@ -2,10 +2,6 @@ set(LLVM_OPTIONAL_SOURCES
ConvertKernelFuncToCubin.cpp
)
-set(SOURCES
- ConvertLaunchFuncToCudaCalls.cpp
-)
-
if (MLIR_CUDA_CONVERSIONS_ENABLED)
list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp")
set(NVPTX_LIBS
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
similarity index 57%
rename from mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
rename to mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 20b76a2e3a29..a3381465ebf2 100644
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -1,11 +1,13 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-cuda | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL
module attributes {gpu.container_module} {
// CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00")
// CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")
+ // ROCDL: llvm.mlir.global internal constant @[[global:.*]]("HSACO")
- gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} {
+ gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO"} {
llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {
llvm.return
}
@@ -18,15 +20,15 @@ module attributes {gpu.container_module} {
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
- // CHECK: %[[cubin_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
+ // CHECK: %[[binary_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
// CHECK-SAME: -> !llvm<"i8*">
// CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
- // CHECK: llvm.call @mcuModuleLoad(%[[module_ptr]], %[[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
+ // CHECK: llvm.call @mgpuModuleLoad(%[[module_ptr]], %[[binary_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
// CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
- // CHECK: llvm.call @mcuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
- // CHECK: llvm.call @mcuGetStreamHelper
- // CHECK: llvm.call @mcuLaunchKernel
- // CHECK: llvm.call @mcuStreamSynchronize
+ // CHECK: llvm.call @mgpuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
+ // CHECK: llvm.call @mgpuGetStreamHelper
+ // CHECK: llvm.call @mgpuLaunchKernel
+ // CHECK: llvm.call @mgpuStreamSynchronize
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_module::@kernel }
: (!llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.float, !llvm<"float*">) -> ()
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
index 0efd1709cee3..cce0b8124f4c 100644
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -30,7 +30,7 @@ int32_t reportErrorIfAny(CUresult result, const char *where) {
}
} // anonymous namespace
-extern "C" int32_t mcuModuleLoad(void **module, void *data) {
+extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
int32_t err = reportErrorIfAny(
cuModuleLoadData(reinterpret_cast<CUmodule *>(module), data),
"ModuleLoad");
@@ -48,11 +48,11 @@ extern "C" int32_t mcuModuleGetFunction(void **function, void *module,
// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
-extern "C" int32_t mcuLaunchKernel(void *function, intptr_t gridX,
- intptr_t gridY, intptr_t gridZ,
- intptr_t blockX, intptr_t blockY,
- intptr_t blockZ, int32_t smem, void *stream,
- void **params, void **extra) {
+extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
+ intptr_t gridY, intptr_t gridZ,
+ intptr_t blockX, intptr_t blockY,
+ intptr_t blockZ, int32_t smem, void *stream,
+ void **params, void **extra) {
return reportErrorIfAny(
cuLaunchKernel(reinterpret_cast<CUfunction>(function), gridX, gridY,
gridZ, blockX, blockY, blockZ, smem,
@@ -60,13 +60,13 @@ extern "C" int32_t mcuLaunchKernel(void *function, intptr_t gridX,
"LaunchKernel");
}
-extern "C" void *mcuGetStreamHelper() {
+extern "C" void *mgpuGetStreamHelper() {
CUstream stream;
reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate");
return stream;
}
-extern "C" int32_t mcuStreamSynchronize(void *stream) {
+extern "C" int32_t mgpuStreamSynchronize(void *stream) {
return reportErrorIfAny(
cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");
}
@@ -75,7 +75,7 @@ extern "C" int32_t mcuStreamSynchronize(void *stream) {
// Allows to register byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
-extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0),
"MemHostRegister");
}
@@ -99,7 +99,7 @@ void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
assert(strides == llvm::makeArrayRef(denseStrides));
std::fill_n(pointer, count, value);
- mcuMemHostRegister(pointer, count * sizeof(T));
+ mgpuMemHostRegister(pointer, count * sizeof(T));
}
extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) {
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
index e784a0abf835..6a404221744b 100644
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/STLExtras.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
@@ -115,7 +116,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
pm.addPass(createLowerToLLVMPass());
- pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());
+ pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());
return pm.run(m);
}
More information about the Mlir-commits
mailing list