[Mlir-commits] [mlir] [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path (PR #66220)
llvmlistbot at llvm.org
Wed Sep 13 08:46:42 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-mlir
Changes
This patch adds an NVPTX compilation path that enables JIT compilation on NVIDIA targets. The following modifications were made:
1. Adding a format field to the GPU object attribute, allowing the translation attribute to pick the correct runtime function for loading the module. Likewise, a dictionary attribute was added for passing any extra options (see the sketch after this list).
2. Adding the `createObject` method to `GPUTargetAttrInterface`; this method returns a GPU object from a binary string.
3. Adding the function `mgpuModuleLoadJIT`, which is only available for NVIDIA GPUs, as there is no equivalent for AMD.
4. Adding the CMake flag `MLIR_GPU_COMPILATION_TEST_FORMAT` to specify the format to use during testing.
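For reference, the sketch below shows the attribute syntax implied by the new assembly format in this patch: the format keyword defaults to `fatbin` when omitted, and the dictionary of extra options is introduced by the `properties` keyword. The property name `O` is only a hypothetical placeholder for whatever options a target chooses to store.

```mlir
// Format omitted: defaults to `fatbin`, matching the previous syntax.
#gpu.object<#nvvm.target, "...">

// Explicit format keyword: one of `offload`, `assembly`, `bin`, `fatbin`.
#gpu.object<#nvvm.target, bin = "...">

// Optional dictionary of extra options (the key `O` is hypothetical).
#gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "...">
```

Since both the attribute and `TargetOptions` default to `CompilationTarget::Fatbin`, the pre-existing `#gpu.object<#nvvm.target, "...">` form should keep round-tripping unchanged.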
NOTE:
1. Not all tests use `MLIR_GPU_COMPILATION_TEST_FORMAT`.
2. An option needs to be added to the `SparseCompiler` to support the format option; however, I don't know whether there's any preference.
3. I'm basing the implementation of `mgpuModuleLoadJIT` on the assumption that there's a [JIT cache](https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/). Another option is to implement the cache in MLIR itself.
--
Patch is 50.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66220.diff
33 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td (+12-3)
- (modified) mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td (+23-2)
- (modified) mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h (+16-22)
- (modified) mlir/include/mlir/Dialect/GPU/Transforms/Passes.td (+1-2)
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+44-5)
- (modified) mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp (+18-18)
- (modified) mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (+21)
- (modified) mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp (+5)
- (modified) mlir/lib/Target/LLVM/NVVM/Target.cpp (+31-7)
- (modified) mlir/lib/Target/LLVM/ROCDL/Target.cpp (+19-2)
- (modified) mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp (+69-21)
- (modified) mlir/test/CMakeLists.txt (+2)
- (modified) mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir (+3-3)
- (modified) mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir (+3-3)
- (modified) mlir/test/Dialect/GPU/ops.mlir (+10)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/async.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/lit.local.cfg (+2)
- (modified) mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/printf.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/shuffle.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/two-modules.mlir (+1-1)
- (modified) mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp (+7-1)
- (modified) mlir/test/lit.site.cfg.py.in (+1)
- (modified) mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp (+3-3)
- (modified) mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp (+3-3)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 5255286619e3bf2..160730480394272 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
If serialization fails then the method should return `std::nullopt`.
- The `module` argument must be a GPU Module Op. The `options` argument is
- meant to be used for passing additional options that are not in the
+ The `module` parameter must be a GPU Module Op. The `options` parameter
+ is meant to be used for passing additional options that are not in the
attribute.
}],
"std::optional<SmallVector<char, 0>>", "serializeToObject",
- (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
+ (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+ InterfaceMethod<[{
+ Creates a GPU object attribute from a binary string.
+
+ The `object` parameter is a binary string. The `options` parameter is
+ meant to be used for passing additional options that are not in the
+ attribute.
+ }], "Attribute", "createObject",
+ (ins "const SmallVector<char, 0>&":$object,
+ "const gpu::TargetOptions&":$options)>
];
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 9c1110d8e9a9463..3d2e9848a2b25a0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,6 +20,18 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
// GPU object attribute.
//===----------------------------------------------------------------------===//
+def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
+def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
+def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
+def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
+def GPU_CompilationTargetEnum : GPU_I32Enum<
+ "CompilationTarget", "GPU object format", [
+ GPU_ObjectOffload,
+ GPU_ObjectISA,
+ GPU_ObjectBinary,
+ GPU_ObjectFatbin
+ ]>;
+
def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
let description = [{
A GPU object attribute pairs a GPU target with a binary string,
@@ -32,8 +44,17 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
#gpu.object<#nvvm.target, "...">
```
}];
- let parameters = (ins "Attribute":$target, "StringAttr":$object);
- let assemblyFormat = [{`<` $target `,` $object `>`}];
+ let parameters = (ins
+ "Attribute":$target,
+ DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
+ "StringAttr":$object,
+ OptionalParameter<"DictionaryAttr">:$properties
+ );
+ let assemblyFormat = [{ `<`
+ $target `,` (`properties` `=` $properties ^ `,`)?
+ custom<Object>($format, $object)
+ `>`
+ }];
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index a1f64be57fa699d..ee7daed58f98314 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
class ModuleTranslation;
}
namespace gpu {
+enum class CompilationTarget : uint32_t;
+
/// This class indicates that the attribute associated with this trait is a GPU
/// offloading translation attribute. These kinds of attributes must implement
/// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
/// ensure type safeness. Targets are free to ignore these options.
class TargetOptions {
public:
- /// The target representation of the compilation process.
- typedef enum {
- offload = 1, /// The process should produce an offloading representation.
- /// For the NVVM & ROCDL targets this option produces LLVM IR.
- assembly = 2, /// The process should produce assembly code.
- binary = 4, /// The process should produce a binary.
- fatbinary = 8, /// The process should produce a fat binary.
- binOrFatbin =
- binary |
- fatbinary, /// The process should produce a binary or fatbinary. It's up
- /// to the target to decide which.
- } CompilationTarget;
-
/// Constructor initializing the toolkit path, the list of files to link to,
/// extra command line options, the compilation target and a callback for
/// obtaining the parent symbol table. The default compilation target is
/// `binOrFatbin`.
- TargetOptions(StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ TargetOptions(
+ StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
+ StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Returns the typeID.
TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
/// table.
SymbolTable *getSymbolTable() const;
+ /// Returns the default compilation target: `CompilationTarget::Fatbin`.
+ static CompilationTarget getDefaultCompilationTarget();
+
protected:
/// Derived classes must use this constructor to initialize `typeID` to the
/// appropiate value: ie. `TargetOptions(TypeID::get<DerivedClass>())`.
- TargetOptions(TypeID typeID, StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ TargetOptions(
+ TypeID typeID, StringRef toolkitPath = {},
+ ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Path to the target toolkit.
std::string toolkitPath;
@@ -108,7 +102,7 @@ class TargetOptions {
/// process.
std::string cmdOptions;
- /// Compilation process target representation.
+ /// Compilation process target format.
CompilationTarget compilationTarget;
/// Callback for obtaining the parent symbol table of all the GPU modules
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0bfb2750992058f..3de8e18851369df 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
2. `assembly`, `isa`: produces assembly code.
3. `binary`, `bin`: produces binaries.
4. `fatbinary`, `fatbin`: produces fatbinaries.
- 5. `binOrFatbin`: produces bins or fatbins, the target decides which.
}];
let options = [
Option<"offloadingHandler", "handler", "Attribute", "nullptr",
@@ -79,7 +78,7 @@ def GpuModuleToBinaryPass
"Extra files to link to.">,
Option<"cmdOptions", "opts", "std::string", [{""}],
"Command line options to pass to the tools.">,
- Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
+ Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
"The target representation of the compilation process.">
];
}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index fde379cd0afe13f..5eb2cadc884e151 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
//===----------------------------------------------------------------------===//
LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
- Attribute target, StringAttr object) {
+ Attribute target, CompilationTarget format,
+ StringAttr object, DictionaryAttr properties) {
if (!target)
return emitError() << "the target attribute cannot be null";
if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
"`gpu::TargetAttrInterface`";
}
+namespace {
+LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
+ StringAttr &object) {
+ std::optional<CompilationTarget> formatResult;
+ StringRef enumKeyword;
+ auto loc = odsParser.getCurrentLocation();
+ if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
+ formatResult = CompilationTarget::Fatbin;
+ if (!formatResult &&
+ (formatResult =
+ gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
+ odsParser.parseEqual())
+ return odsParser.emitError(loc, "expected an equal sign");
+ if (!formatResult)
+ return odsParser.emitError(loc, "expected keyword for GPU object format");
+ FailureOr<StringAttr> objectResult =
+ FieldParser<StringAttr>::parse(odsParser);
+ if (failed(objectResult))
+ return odsParser.emitError(odsParser.getCurrentLocation(),
+ "failed to parse GPU_ObjectAttr parameter "
+ "'object' which is to be a `StringAttr`");
+ format = *formatResult;
+ object = *objectResult;
+ return success();
+}
+
+void printObject(AsmPrinter &odsParser, CompilationTarget format,
+ StringAttr object) {
+ if (format != CompilationTarget::Fatbin)
+ odsParser << stringifyEnum(format) << " = ";
+ odsParser << object;
+}
+} // namespace
+
//===----------------------------------------------------------------------===//
// GPU select object attribute
//===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
}
+CompilationTarget TargetOptions::getCompilationTarget() const {
+ return compilationTarget;
+}
+
+CompilationTarget TargetOptions::getDefaultCompilationTarget() {
+ return CompilationTarget::Fatbin;
+}
+
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
TargetOptions::tokenizeCmdOptions() const {
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@ TargetOptions::tokenizeCmdOptions() const {
return options;
}
-TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
- return compilationTarget;
-}
-
MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)
#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index e29a1f0c3248d04..2bf89f8c57903e5 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(
void GpuModuleToBinaryPass::runOnOperation() {
RewritePatternSet patterns(&getContext());
- int targetFormat = llvm::StringSwitch<int>(compilationTarget)
- .Cases("offloading", "llvm", TargetOptions::offload)
- .Cases("assembly", "isa", TargetOptions::assembly)
- .Cases("binary", "bin", TargetOptions::binary)
- .Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
- .Case("binOrFatbin", TargetOptions::binOrFatbin)
- .Default(-1);
- if (targetFormat == -1)
+ auto targetFormat =
+ llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
+ .Cases("offloading", "llvm", CompilationTarget::Offload)
+ .Cases("assembly", "isa", CompilationTarget::Assembly)
+ .Cases("binary", "bin", CompilationTarget::Binary)
+ .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+ .Default(std::nullopt);
+ if (!targetFormat)
getOperation()->emitError() << "Invalid format specified.";
// Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
return &parentTable.value();
};
- TargetOptions targetOptions(
- toolkitPath, linkFiles, cmdOptions,
- static_cast<TargetOptions::CompilationTarget>(targetFormat),
- lazyTableBuilder);
+ TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
+ lazyTableBuilder);
if (failed(transformGpuModulesToBinaries(
getOperation(),
offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
@@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
assert(target &&
"Target attribute doesn't implements `TargetAttrInterface`.");
- std::optional<SmallVector<char, 0>> object =
+ std::optional<SmallVector<char, 0>> serializedModule =
target.serializeToObject(op, targetOptions);
-
- if (!object) {
+ if (!serializedModule) {
op.emitError("An error happened while serializing the module.");
return failure();
}
- objects.push_back(builder.getAttr<gpu::ObjectAttr>(
- target,
- builder.getStringAttr(StringRef(object->data(), object->size()))));
+ Attribute object = target.createObject(*serializedModule, targetOptions);
+ if (!object) {
+ op.emitError("An error happened while creating the object.");
+ return failure();
+ }
+ objects.push_back(object);
}
builder.setInsertionPointAfter(op);
builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 7bf6804902479a8..d19d473a5327627 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
return module;
}
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
+ int optLevel) {
+ ScopedContext scopedContext;
+ CUmodule module = nullptr;
+ char jitErrorBuffer[4096] = {0};
+ CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+ CU_JIT_OPTIMIZATION_LEVEL};
+ void *jitOptionsVals[] = {jitErrorBuffer,
+ reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
+ reinterpret_cast<void *>(optLevel)};
+
+ CUresult result =
+ cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
+ if (result) {
+ fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
+ CUDA_REPORT_IF_ERROR(result);
+ }
+ return module;
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..da2ae87fef6715f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
return module;
}
+extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
+ assert(false && "This function is not available in HIP.");
+ return nullptr;
+}
+
extern "C" void mgpuModuleUnload(hipModule_t module) {
HIP_REPORT_IF_ERROR(hipModuleUnload(module));
}
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 13188b1107d928b..7f263627db54fbe 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -47,6 +47,10 @@ class NVVMTargetAttrImpl
std::optional<SmallVector<char, 0>>
serializeT...
<truncated>
https://github.com/llvm/llvm-project/pull/66220