[Mlir-commits] [mlir] [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path (PR #66220)
Fabian Mora
llvmlistbot at llvm.org
Thu Sep 14 06:13:48 PDT 2023
https://github.com/fabianmcg updated https://github.com/llvm/llvm-project/pull/66220:
From 608af55af6b768efbc2ab4139aea61a8728e50e6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 12 Sep 2023 19:32:43 +0000
Subject: [PATCH 1/3] [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path
This patch adds an NVPTX compilation path that enables JIT compilation on NVIDIA
targets. The following modifications were performed:
1. Adding a format field to the GPU object attribute, allowing the translation
attribute to use the correct runtime function to load the module. Likewise, a
dictionary attribute was added for passing any extra options.
2. Adding the "createObject" method to "GPUTargetAttrInterface"; this method
returns a GPU object from a binary string (see the sketch after this list).
3. Adding the function "mgpuModuleLoadJIT", which is only available for NVIDIA GPUs,
as there is no equivalent for AMD.
4. Adding the CMake flag `MLIR_GPU_COMPILATION_TEST_FORMAT` to specify the format
to use during testing.
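As a rough illustration, a minimal C++ sketch of the resulting flow (mirroring
`moduleSerializer` in `ModuleToBinary.cpp`; the surrounding builder and options
setup is assumed):

  // Serialize the GPU module, then wrap the raw bytes in a `#gpu.object`
  // attribute through the new `createObject` interface method.
  std::optional<SmallVector<char, 0>> serializedModule =
      target.serializeToObject(gpuModule, targetOptions);
  if (serializedModule) {
    Attribute object = target.createObject(*serializedModule, targetOptions);
    // `object` now records the compilation format (e.g. `assembly`) and any
    // extra properties alongside the binary string.
  }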
---
.../GPU/IR/CompilationAttrInterfaces.td | 15 +++-
.../mlir/Dialect/GPU/IR/CompilationAttrs.td | 25 +++++-
.../Dialect/GPU/IR/CompilationInterfaces.h | 38 ++++----
.../mlir/Dialect/GPU/Transforms/Passes.td | 3 +-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 49 ++++++++--
.../Dialect/GPU/Transforms/ModuleToBinary.cpp | 36 ++++----
.../ExecutionEngine/CudaRuntimeWrappers.cpp | 21 +++++
.../ExecutionEngine/RocmRuntimeWrappers.cpp | 5 ++
mlir/lib/Target/LLVM/NVVM/Target.cpp | 38 ++++++--
mlir/lib/Target/LLVM/ROCDL/Target.cpp | 21 ++++-
.../LLVMIR/Dialect/GPU/SelectObjectAttr.cpp | 90 ++++++++++++++-----
mlir/test/CMakeLists.txt | 2 +
.../Dialect/GPU/module-to-binary-nvvm.mlir | 6 +-
.../Dialect/GPU/module-to-binary-rocdl.mlir | 6 +-
mlir/test/Dialect/GPU/ops.mlir | 10 +++
.../Integration/GPU/CUDA/all-reduce-and.mlir | 2 +-
.../Integration/GPU/CUDA/all-reduce-max.mlir | 2 +-
.../Integration/GPU/CUDA/all-reduce-min.mlir | 2 +-
.../Integration/GPU/CUDA/all-reduce-op.mlir | 2 +-
.../Integration/GPU/CUDA/all-reduce-or.mlir | 2 +-
.../GPU/CUDA/all-reduce-region.mlir | 2 +-
.../Integration/GPU/CUDA/all-reduce-xor.mlir | 2 +-
mlir/test/Integration/GPU/CUDA/async.mlir | 2 +-
.../Integration/GPU/CUDA/gpu-to-cubin.mlir | 2 +-
mlir/test/Integration/GPU/CUDA/lit.local.cfg | 2 +
.../GPU/CUDA/multiple-all-reduce.mlir | 2 +-
mlir/test/Integration/GPU/CUDA/printf.mlir | 2 +-
mlir/test/Integration/GPU/CUDA/shuffle.mlir | 2 +-
.../Integration/GPU/CUDA/two-modules.mlir | 2 +-
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp | 8 +-
mlir/test/lit.site.cfg.py.in | 1 +
.../Target/LLVM/SerializeNVVMTarget.cpp | 6 +-
.../Target/LLVM/SerializeROCDLTarget.cpp | 6 +-
33 files changed, 306 insertions(+), 108 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 5255286619e3bf2..160730480394272 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
If serialization fails then the method should return `std::nullopt`.
- The `module` argument must be a GPU Module Op. The `options` argument is
- meant to be used for passing additional options that are not in the
+ The `module` parameter must be a GPU Module Op. The `options` parameter
+ is meant to be used for passing additional options that are not in the
attribute.
}],
"std::optional<SmallVector<char, 0>>", "serializeToObject",
- (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
+ (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+ InterfaceMethod<[{
+ Creates a GPU object attribute from a binary string.
+
+ The `object` parameter is a binary string. The `options` parameter is
+ meant to be used for passing additional options that are not in the
+ attribute.
+ }], "Attribute", "createObject",
+ (ins "const SmallVector<char, 0>&":$object,
+ "const gpu::TargetOptions&":$options)>
];
}
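For target authors, a hedged outline of a `createObject` implementation (the
class name `MyTargetAttrImpl` is illustrative only; compare the actual ROCDL
implementation further down):

  Attribute MyTargetAttrImpl::createObject(
      Attribute attribute, const SmallVector<char, 0> &object,
      const gpu::TargetOptions &options) const {
    // Wrap the serialized bytes in a `#gpu.object` tagged with the format.
    Builder builder(attribute.getContext());
    return builder.getAttr<gpu::ObjectAttr>(
        attribute, options.getCompilationTarget(),
        builder.getStringAttr(StringRef(object.data(), object.size())),
        /*properties=*/nullptr);
  }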
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 9c1110d8e9a9463..3d2e9848a2b25a0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,6 +20,18 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
// GPU object attribute.
//===----------------------------------------------------------------------===//
+def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
+def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
+def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
+def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
+def GPU_CompilationTargetEnum : GPU_I32Enum<
+ "CompilationTarget", "GPU object format", [
+ GPU_ObjectOffload,
+ GPU_ObjectISA,
+ GPU_ObjectBinary,
+ GPU_ObjectFatbin
+ ]>;
+
def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
let description = [{
A GPU object attribute pairs a GPU target with a binary string,
@@ -32,8 +44,17 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
#gpu.object<#nvvm.target, "...">
```
}];
- let parameters = (ins "Attribute":$target, "StringAttr":$object);
- let assemblyFormat = [{`<` $target `,` $object `>`}];
+ let parameters = (ins
+ "Attribute":$target,
+ DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
+ "StringAttr":$object,
+ OptionalParameter<"DictionaryAttr">:$properties
+ );
+ let assemblyFormat = [{ `<`
+ $target `,` (`properties` `=` $properties ^ `,`)?
+ custom<Object>($format, $object)
+ `>`
+ }];
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index a1f64be57fa699d..ee7daed58f98314 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
class ModuleTranslation;
}
namespace gpu {
+enum class CompilationTarget : uint32_t;
+
/// This class indicates that the attribute associated with this trait is a GPU
/// offloading translation attribute. These kinds of attributes must implement
/// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
/// ensure type safeness. Targets are free to ignore these options.
class TargetOptions {
public:
- /// The target representation of the compilation process.
- typedef enum {
- offload = 1, /// The process should produce an offloading representation.
- /// For the NVVM & ROCDL targets this option produces LLVM IR.
- assembly = 2, /// The process should produce assembly code.
- binary = 4, /// The process should produce a binary.
- fatbinary = 8, /// The process should produce a fat binary.
- binOrFatbin =
- binary |
- fatbinary, /// The process should produce a binary or fatbinary. It's up
- /// to the target to decide which.
- } CompilationTarget;
-
/// Constructor initializing the toolkit path, the list of files to link to,
/// extra command line options, the compilation target and a callback for
/// obtaining the parent symbol table. The default compilation target is
/// `binOrFatbin`.
- TargetOptions(StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ TargetOptions(
+ StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
+ StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Returns the typeID.
TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
/// table.
SymbolTable *getSymbolTable() const;
+ /// Returns the default compilation target: `CompilationTarget::Fatbin`.
+ static CompilationTarget getDefaultCompilationTarget();
+
protected:
/// Derived classes must use this constructor to initialize `typeID` to the
/// appropriate value, i.e. `TargetOptions(TypeID::get<DerivedClass>())`.
- TargetOptions(TypeID typeID, StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ TargetOptions(
+ TypeID typeID, StringRef toolkitPath = {},
+ ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Path to the target toolkit.
std::string toolkitPath;
@@ -108,7 +102,7 @@ class TargetOptions {
/// process.
std::string cmdOptions;
- /// Compilation process target representation.
+ /// Compilation process target format.
CompilationTarget compilationTarget;
/// Callback for obtaining the parent symbol table of all the GPU modules
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0bfb2750992058f..3de8e18851369df 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
2. `assembly`, `isa`: produces assembly code.
3. `binary`, `bin`: produces binaries.
4. `fatbinary`, `fatbin`: produces fatbinaries.
- 5. `binOrFatbin`: produces bins or fatbins, the target decides which.
}];
let options = [
Option<"offloadingHandler", "handler", "Attribute", "nullptr",
@@ -79,7 +78,7 @@ def GpuModuleToBinaryPass
"Extra files to link to.">,
Option<"cmdOptions", "opts", "std::string", [{""}],
"Command line options to pass to the tools.">,
- Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
+ Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
"The target representation of the compilation process.">
];
}
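When assembling a pipeline in C++ rather than on the command line, the same
option can be set programmatically; a short sketch matching the
`TestLowerToNVVM.cpp` change later in this patch:

  // Request `isa` (e.g. PTX) objects instead of the default fatbin.
  GpuModuleToBinaryPassOptions opts;
  opts.compilationTarget = "isa";
  pm.addPass(createGpuModuleToBinaryPass(opts));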
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index fde379cd0afe13f..5eb2cadc884e151 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
//===----------------------------------------------------------------------===//
LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
- Attribute target, StringAttr object) {
+ Attribute target, CompilationTarget format,
+ StringAttr object, DictionaryAttr properties) {
if (!target)
return emitError() << "the target attribute cannot be null";
if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
"`gpu::TargetAttrInterface`";
}
+namespace {
+LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
+ StringAttr &object) {
+ std::optional<CompilationTarget> formatResult;
+ StringRef enumKeyword;
+ auto loc = odsParser.getCurrentLocation();
+ if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
+ formatResult = CompilationTarget::Fatbin;
+ if (!formatResult &&
+ (formatResult =
+ gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
+ odsParser.parseEqual())
+ return odsParser.emitError(loc, "expected an equal sign");
+ if (!formatResult)
+ return odsParser.emitError(loc, "expected keyword for GPU object format");
+ FailureOr<StringAttr> objectResult =
+ FieldParser<StringAttr>::parse(odsParser);
+ if (failed(objectResult))
+ return odsParser.emitError(odsParser.getCurrentLocation(),
+ "failed to parse GPU_ObjectAttr parameter "
+ "'object' which is to be a `StringAttr`");
+ format = *formatResult;
+ object = *objectResult;
+ return success();
+}
+
+void printObject(AsmPrinter &odsParser, CompilationTarget format,
+ StringAttr object) {
+ if (format != CompilationTarget::Fatbin)
+ odsParser << stringifyEnum(format) << " = ";
+ odsParser << object;
+}
+} // namespace
+
//===----------------------------------------------------------------------===//
// GPU select object attribute
//===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
}
+CompilationTarget TargetOptions::getCompilationTarget() const {
+ return compilationTarget;
+}
+
+CompilationTarget TargetOptions::getDefaultCompilationTarget() {
+ return CompilationTarget::Fatbin;
+}
+
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
TargetOptions::tokenizeCmdOptions() const {
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@ TargetOptions::tokenizeCmdOptions() const {
return options;
}
-TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
- return compilationTarget;
-}
-
MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)
#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index e29a1f0c3248d04..2bf89f8c57903e5 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(
void GpuModuleToBinaryPass::runOnOperation() {
RewritePatternSet patterns(&getContext());
- int targetFormat = llvm::StringSwitch<int>(compilationTarget)
- .Cases("offloading", "llvm", TargetOptions::offload)
- .Cases("assembly", "isa", TargetOptions::assembly)
- .Cases("binary", "bin", TargetOptions::binary)
- .Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
- .Case("binOrFatbin", TargetOptions::binOrFatbin)
- .Default(-1);
- if (targetFormat == -1)
+ auto targetFormat =
+ llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
+ .Cases("offloading", "llvm", CompilationTarget::Offload)
+ .Cases("assembly", "isa", CompilationTarget::Assembly)
+ .Cases("binary", "bin", CompilationTarget::Binary)
+ .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+ .Default(std::nullopt);
+ if (!targetFormat)
getOperation()->emitError() << "Invalid format specified.";
// Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
return &parentTable.value();
};
- TargetOptions targetOptions(
- toolkitPath, linkFiles, cmdOptions,
- static_cast<TargetOptions::CompilationTarget>(targetFormat),
- lazyTableBuilder);
+ TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
+ lazyTableBuilder);
if (failed(transformGpuModulesToBinaries(
getOperation(),
offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
@@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
assert(target &&
"Target attribute doesn't implements `TargetAttrInterface`.");
- std::optional<SmallVector<char, 0>> object =
+ std::optional<SmallVector<char, 0>> serializedModule =
target.serializeToObject(op, targetOptions);
-
- if (!object) {
+ if (!serializedModule) {
op.emitError("An error happened while serializing the module.");
return failure();
}
- objects.push_back(builder.getAttr<gpu::ObjectAttr>(
- target,
- builder.getStringAttr(StringRef(object->data(), object->size()))));
+ Attribute object = target.createObject(*serializedModule, targetOptions);
+ if (!object) {
+ op.emitError("An error happened while creating the object.");
+ return failure();
+ }
+ objects.push_back(object);
}
builder.setInsertionPointAfter(op);
builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 7bf6804902479a8..d19d473a5327627 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
return module;
}
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
+ int optLevel) {
+ ScopedContext scopedContext;
+ CUmodule module = nullptr;
+ char jitErrorBuffer[4096] = {0};
+ CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+ CU_JIT_OPTIMIZATION_LEVEL};
+ void *jitOptionsVals[] = {jitErrorBuffer,
+ reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
+ reinterpret_cast<void *>(optLevel)};
+
+ CUresult result =
+ cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
+ if (result) {
+ fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
+ CUDA_REPORT_IF_ERROR(result);
+ }
+ return module;
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}
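This wrapper is only reached for `assembly` objects; the host code emitted by
the offloading attribute (see `SelectObjectAttr.cpp` below) behaves roughly
like the following hypothetical sketch, where `moduleBinary` stands in for the
embedded global holding the PTX:

  extern "C" CUmodule mgpuModuleLoadJIT(void *data, int optLevel);
  // JIT-compile the embedded PTX; the optimization level comes from the
  // object's `O` property (0 when absent).
  CUmodule module = mgpuModuleLoadJIT(moduleBinary, /*optLevel=*/2);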
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..da2ae87fef6715f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
return module;
}
+extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
+ assert(false && "This function is not available in HIP.");
+ return nullptr;
+}
+
extern "C" void mgpuModuleUnload(hipModule_t module) {
HIP_REPORT_IF_ERROR(hipModuleUnload(module));
}
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 13188b1107d928b..7f263627db54fbe 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -47,6 +47,10 @@ class NVVMTargetAttrImpl
std::optional<SmallVector<char, 0>>
serializeToObject(Attribute attribute, Operation *module,
const gpu::TargetOptions &options) const;
+
+ Attribute createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const;
};
} // namespace
@@ -227,9 +231,9 @@ std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
}
// 2. Check PATH.
- if (std::optional<std::string> ptxasCompiler =
+ if (std::optional<std::string> toolPath =
llvm::sys::Process::FindInEnvPath("PATH", tool))
- return *ptxasCompiler;
+ return *toolPath;
// 3. Check `getCUDAToolkitPath()`.
pathRef = getCUDAToolkitPath();
@@ -255,8 +259,7 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
// Determine if the serializer should create a fatbinary with the PTX embedded
// or a simple CUBIN binary.
const bool createFatbin =
- (targetOptions.getCompilationTarget() & gpu::TargetOptions::fatbinary) ==
- gpu::TargetOptions::fatbinary;
+ targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;
// Find the `ptxas` & `fatbinary` tools.
std::optional<std::string> ptxasCompiler = findTool("ptxas");
@@ -522,7 +525,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
llvm::dbgs().flush();
});
#undef DEBUG_TYPE
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
// Emit PTX code.
@@ -541,8 +544,12 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
#undef DEBUG_TYPE
// Return PTX if the compilation target is assembly.
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
- return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
+ if (targetOptions.getCompilationTarget() ==
+ gpu::CompilationTarget::Assembly) {
+ // Make sure to include the null terminator; the driver expects PTX to be a null-terminated string.
+ StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+ return SmallVector<char, 0>(bin.begin(), bin.end());
+ }
// Compile to binary.
#if MLIR_NVPTXCOMPILER_ENABLED == 1
@@ -573,3 +580,20 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
return std::nullopt;
#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
}
+
+Attribute
+NVVMTargetAttrImpl::createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const {
+ auto target = cast<NVVMTargetAttr>(attribute);
+ gpu::CompilationTarget format = options.getCompilationTarget();
+ DictionaryAttr objectProps;
+ Builder builder(attribute.getContext());
+ if (format == gpu::CompilationTarget::Assembly)
+ objectProps = builder.getDictionaryAttr(
+ {builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))});
+ return builder.getAttr<gpu::ObjectAttr>(
+ attribute, format,
+ builder.getStringAttr(StringRef(object.data(), object.size())),
+ objectProps);
+}
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index effb88554e8ee04..611d08fe3e79e56 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -60,6 +60,10 @@ class ROCDLTargetAttrImpl
std::optional<SmallVector<char, 0>>
serializeToObject(Attribute attribute, Operation *module,
const gpu::TargetOptions &options) const;
+
+ Attribute createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const;
};
} // namespace
@@ -417,7 +421,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
<< llvmModule << "\n";
});
#undef DEBUG_TYPE
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
// Translate the Module to ISA.
@@ -434,7 +438,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
});
#undef DEBUG_TYPE
// Return ISA assembly code if the compilation target is assembly.
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
// Compile to binary.
@@ -463,3 +467,16 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
return std::nullopt;
#endif // MLIR_ROCM_CONVERSIONS_ENABLED == 1
}
+
+Attribute
+ROCDLTargetAttrImpl::createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const {
+ gpu::CompilationTarget format = options.getCompilationTarget();
+ Builder builder(attribute.getContext());
+ return builder.getAttr<gpu::ObjectAttr>(
+ attribute,
+ format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
+ : format,
+ builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
index 3b060ac1779db26..47fe6973778cd7f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -44,6 +44,9 @@ class SelectObjectAttrImpl
Operation *binaryOperation,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) const;
+
+ // Returns the selected object for embedding.
+ gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
};
// Returns an identifier for the global string holding the binary.
std::string getBinaryIdentifier(StringRef binaryName) {
@@ -58,24 +61,15 @@ void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
});
}
-LogicalResult SelectObjectAttrImpl::embedBinary(
- Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation) const {
- assert(operation && "The binary operation must be non null.");
- if (!operation)
- return failure();
-
- auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
- if (!op) {
- operation->emitError("Operation must be a GPU binary.");
- return failure();
- }
-
+gpu::ObjectAttr
+SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
// Obtain the index of the object to select.
int64_t index = -1;
- if (Attribute target = cast<gpu::SelectObjectAttr>(attribute).getTarget()) {
+ if (Attribute target =
+ cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
+ .getTarget()) {
// If the target attribute is a number it is the index. Otherwise compare
// the attribute to every target inside the object array to find the index.
if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
@@ -95,10 +89,28 @@ LogicalResult SelectObjectAttrImpl::embedBinary(
}
if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
- op->emitError("The requested target object couldn't be found.");
+ op->emitError("the requested target object couldn't be found");
+ return nullptr;
+ }
+ return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+}
+
+LogicalResult SelectObjectAttrImpl::embedBinary(
+ Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ assert(operation && "The binary operation must be non null.");
+ if (!operation)
+ return failure();
+
+ auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
+ if (!op) {
+ operation->emitError("operation must be a GPU binary");
return failure();
}
- auto object = mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+
+ gpu::ObjectAttr object = getSelectedObject(op);
+ if (!object)
+ return failure();
llvm::Module *module = moduleTranslation.getLLVMModule();
@@ -130,6 +142,9 @@ class LaunchKernel {
// Get the module load callee.
FunctionCallee getModuleLoadFn();
+ // Get the module load JIT callee.
+ FunctionCallee getModuleLoadJITFn();
+
// Get the module unload callee.
FunctionCallee getModuleUnloadFn();
@@ -149,7 +164,8 @@ class LaunchKernel {
Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
// Create the full kernel launch.
- mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op);
+ mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+ mlir::gpu::ObjectAttr object);
private:
Module &module;
@@ -174,13 +190,22 @@ LogicalResult SelectObjectAttrImpl::launchKernel(
auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
if (!launchFuncOp) {
- launchFuncOperation->emitError("Operation must be a GPU launch func Op.");
+ launchFuncOperation->emitError("operation must be a GPU launch func Op.");
return failure();
}
+ auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
+ if (!binOp) {
+ binaryOperation->emitError("operation must be a GPU binary.");
+ return failure();
+ }
+ gpu::ObjectAttr object = getSelectedObject(binOp);
+ if (!object)
+ return failure();
+
return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
moduleTranslation)
- .createKernelLaunch(launchFuncOp);
+ .createKernelLaunch(launchFuncOp, object);
}
llvm::LaunchKernel::LaunchKernel(
@@ -215,6 +240,12 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy}), false));
}
+llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
+ return module.getOrInsertFunction(
+ "mgpuModuleLoadJIT",
+ FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i32Ty}), false));
+}
+
llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
return module.getOrInsertFunction(
"mgpuModuleUnload",
@@ -299,7 +330,8 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
// call %streamDestroy(%4)
// call %moduleUnload(%1)
mlir::LogicalResult
-llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
+llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+ mlir::gpu::ObjectAttr object) {
auto llvmValue = [&](mlir::Value value) -> Value * {
Value *v = moduleTranslation.lookupValue(value);
assert(v && "Value has not been translated.");
@@ -326,13 +358,29 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
// Create the argument array.
Value *argArray = createKernelArgArray(op);
+ // Default JIT optimization level.
+ llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
+ // Check if there's an optimization level embedded in the object.
+ DictionaryAttr objectProps = object.getProperties();
+ mlir::Attribute optAttr;
+ if (objectProps && (optAttr = objectProps.get("O"))) {
+ auto optLevel = dyn_cast<IntegerAttr>(optAttr);
+ if (!optLevel)
+ return op.emitError("the optimization level must be an integer");
+ optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
+ }
+
// Load the kernel module.
StringRef moduleName = op.getKernelModuleName().getValue();
std::string binaryIdentifier = getBinaryIdentifier(moduleName);
Value *binary = module.getGlobalVariable(binaryIdentifier, true);
if (!binary)
return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
- Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary});
+
+ Value *moduleObject =
+ object.getFormat() == gpu::CompilationTarget::Assembly
+ ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
+ : builder.CreateCall(getModuleLoadFn(), {binary});
// Load the kernel function.
Value *moduleFunction = builder.CreateCall(
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..bf143d036c2f66f 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -26,6 +26,8 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS)
"If arch-specific Arm integration tests run emulated, use this Arm native lli.")
set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING
"If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.")
+ set(MLIR_GPU_COMPILATION_TEST_FORMAT "fatbin" CACHE STRING
+ "The GPU compilation format used by the tests.")
option(MLIR_RUN_AMX_TESTS "Run AMX tests.")
option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.")
option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")
diff --git a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
index 555b28a8293ee4f..22d7caa38feec97 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
@@ -1,10 +1,10 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
module attributes {gpu.container_module} {
// CHECK-LABEL:gpu.binary @kernel_module1
- // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, "{{.*}}">]
+ // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, offload = "{{.*}}">]
gpu.module @kernel_module1 [#nvvm.target<chip = "sm_70">] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
}
// CHECK-LABEL:gpu.binary @kernel_module2
- // CHECK:[#gpu.object<#nvvm.target<flags = {fast}>, "{{.*}}">, #gpu.object<#nvvm.target, "{{.*}}">]
+ // CHECK-ISA:[#gpu.object<#nvvm.target<flags = {fast}>, properties = {O = 2 : i32}, assembly = "{{.*}}">, #gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "{{.*}}">]
gpu.module @kernel_module2 [#nvvm.target<flags = {fast}>, #nvvm.target] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
index fb7cfb70c17ed3b..9f987c71387f4ca 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
@@ -1,10 +1,10 @@
// REQUIRES: host-supports-amdgpu
// RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
module attributes {gpu.container_module} {
// CHECK-LABEL:gpu.binary @kernel_module1
- // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, "{{.*}}">]
+ // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, offload = "{{.*}}">]
gpu.module @kernel_module1 [#rocdl.target<chip = "gfx90a">] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
}
// CHECK-LABEL:gpu.binary @kernel_module2
- // CHECK:[#gpu.object<#rocdl.target<flags = {fast}>, "{{.*}}">, #gpu.object<#rocdl.target, "{{.*}}">]
+ // CHECK-ISA:[#gpu.object<#rocdl.target<flags = {fast}>, assembly = "{{.*}}">, #gpu.object<#rocdl.target, assembly = "{{.*}}">]
gpu.module @kernel_module2 [#rocdl.target<flags = {fast}>, #rocdl.target] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b314a768a089632..0d2f52e8adbfcda 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -127,6 +127,16 @@ module attributes {gpu.container_module} {
gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]
+ gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
+ #gpu.object<#nvvm.target, assembly = "">,
+ #gpu.object<#nvvm.target, offload = "">,
+ #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
+ ]
+
+ // Check that fatbin gets elided as it's the default format.
+ // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
+ gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]
+
func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)
func.func @foo() {
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index c48a515ed022135..1401ac7bd489cc9 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -10,7 +10,7 @@
// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index e8ffc3f830c7c91..2e72ccabd636514 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index fde50e9b6b92fbd..532834197f63d32 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 08c3571ef1c35fa..7d0f56e9a125691 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 134296f39c2b49e..9d00b49c75ff13f 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index c2be1b65950ea51..bf726821cea40a2 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 6b75321b7bfc235..9cfadac04d33d00 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index 1314d32a779a883..f53249d1bb2143f 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -3,7 +3,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%format" \
// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
// RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index abc93f7b1703a66..09211e34d8d14d4 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
index acb8dd43f50b4cf..c49265d67433ede 100644
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
if not config.enable_cuda_runner:
config.unsupported = True
+
+config.substitutions.append(("%format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 3389f805ac63d0f..6aced55992417eb 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index eef5ac66ca52ad4..c22c02ccb24500d 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 05cb854d18dd4f3..949631709febfcb 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index fde66de2fce6e7e..7b52db81fc11079 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5d0c420f65d5319..a554fe6dcadb12c 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -70,6 +70,10 @@ struct TestLowerToNVVMOptions
*this, "cubin-features",
llvm::cl::desc("Features to use to serialize to cubin."),
llvm::cl::init("+ptx60")};
+ PassOptions::Option<std::string> cubinFormat{
+ *this, "cubin-format",
+ llvm::cl::desc("Compilation format to use to serialize to cubin."),
+ llvm::cl::init("isa")};
};
//===----------------------------------------------------------------------===//
@@ -257,7 +261,9 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
// Serialize all GPU modules to binaries.
- pm.addPass(createGpuModuleToBinaryPass());
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
// Convert vector to LLVM (always needed).
// TODO: C++20 designated initializers.
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..2de40ba5e8e57e6 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -29,6 +29,7 @@ config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
+config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 62c9b527e1e38cb..a00ebba7b97e6d2 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -79,7 +79,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMMToLLVM)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -115,7 +115,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -145,7 +145,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
index 89cf5c5d2ada586..9ada2dab40ff79e 100644
--- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
@@ -83,7 +83,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLMToLLVM)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -119,7 +119,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToPTX)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -149,7 +149,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToBinary)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
From 05afac027db6cbebcdacdf6761c0e0672bfd9b1e Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Wed, 13 Sep 2023 20:43:05 +0000
Subject: [PATCH 2/3] Add a format option to the SparseCompiler pipeline.
---
.../mlir/Dialect/SparseTensor/Pipelines/Passes.h | 16 ++++++++++++++++
.../Pipelines/SparseTensorPipelines.cpp | 4 +++-
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 5deab8321cbcbde..ac3c2fde8a4840f 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,6 +144,22 @@ struct SparseCompilerOptions
desc("GPU target architecture")};
PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
desc("GPU target features")};
+ /// For NVIDIA GPUs there are 3 compilation format options:
+ /// 1. `isa`: the compiler generates PTX and the runtime JITs the PTX.
+ /// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
+ /// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
+ /// also embeds the PTX in the fat binary.
+ /// Notes:
+ /// Option 1 adds a significant runtime performance hit; however, tests are
+ /// more likely to pass with this option.
+ /// Option 2 is better for execution time as there is no JIT; however, the
+ /// program will fail if there's an arch mismatch between `gpuChip` and the
+ /// GPU running the program.
+ /// Option 3 is the best compromise between options 1 & 2 as it can JIT in
+ /// case of an arch mismatch, however, it's only possible to JIT to a higher
+ /// CC than `gpuChip`.
+ PassOptions::Option<std::string> gpuFormat{
+ *this, "gpu-format", desc("GPU compilation format"), init("isa")};
/// This option is used to enable GPU library generation.
PassOptions::Option<bool> enableGPULibgen{
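Given the trade-offs above, a pipeline built in C++ can select the format
explicitly; a sketch assuming the usual pass-manager setup (the integration
tests in the follow-up commit instead use the `%format` lit substitution):

  SparseCompilerOptions options;
  options.gpuFormat = "isa"; // JIT the PTX at runtime; see the notes above.
  mlir::sparse_tensor::buildSparseCompiler(pm, options);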
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 37f9e09d34c04e7..54069064839eac3 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -84,7 +84,9 @@ void mlir::sparse_tensor::buildSparseCompiler(
nvvmTargetOptions.features = options.gpuFeatures;
pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
pm.addPass(createGpuToLLVMConversionPass());
- pm.addPass(createGpuModuleToBinaryPass());
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.gpuFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
}
pm.addPass(createReconcileUnrealizedCastsPass());
From 4ad818f186ab35d0bc344203ab1560de43fda3be Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Thu, 14 Sep 2023 13:05:47 +0000
Subject: [PATCH 3/3] Updated the docs & migrated more tests to use
MLIR_GPU_COMPILATION_TEST_FORMAT
---
.../mlir/Dialect/GPU/IR/CompilationAttrs.td | 33 +++++++++++++++----
.../Dialect/GPU/IR/CompilationInterfaces.h | 2 +-
.../Dialect/SparseTensor/Pipelines/Passes.h | 8 ++---
.../SparseTensor/GPU/CUDA/lit.local.cfg | 2 ++
.../sparse-matmul-2-4-lib-from-linalg.mlir | 2 +-
.../CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir | 2 +-
.../GPU/CUDA/sparse-gemm-lib.mlir | 4 +--
.../GPU/CUDA/sparse-matmul-lib.mlir | 2 +-
.../GPU/CUDA/sparse-matvec-const.mlir | 2 +-
.../GPU/CUDA/sparse-matvec-lib.mlir | 2 +-
.../SparseTensor/GPU/CUDA/sparse-matvec.mlir | 2 +-
.../GPU/CUDA/sparse-mma-2-4-f16.mlir | 2 +-
.../GPU/CUDA/sparse-sampled-matmul-lib.mlir | 2 +-
...ansform-mma-sync-matmul-f16-f16-accum.mlir | 2 +-
.../sm80/transform-mma-sync-matmul-f32.mlir | 2 +-
.../GPU/CUDA/TensorCore/wmma-matmul-f16.mlir | 2 +-
.../TensorCore/wmma-matmul-f32-bare-ptr.mlir | 2 +-
.../GPU/CUDA/TensorCore/wmma-matmul-f32.mlir | 2 +-
.../sm90/tma_load_128x64_swizzle128b.mlir | 2 +-
.../sm90/tma_load_64x8_8x128_noswizzle.mlir | 4 +--
...a_load_64x8_8x128_noswizzle-transform.mlir | 4 +--
21 files changed, 54 insertions(+), 31 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 3d2e9848a2b25a0..cc43c53c269d87a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,12 +20,13 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
// GPU object attribute.
//===----------------------------------------------------------------------===//
+// For documentation on these enum cases, see the `GPU_ObjectAttr` docs.
def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
def GPU_CompilationTargetEnum : GPU_I32Enum<
- "CompilationTarget", "GPU object format", [
+ "CompilationTarget", "GPU compilation format", [
GPU_ObjectOffload,
GPU_ObjectISA,
GPU_ObjectBinary,
@@ -34,14 +35,34 @@ def GPU_CompilationTargetEnum : GPU_I32Enum<
def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
let description = [{
- A GPU object attribute pairs a GPU target with a binary string,
- encapsulating the information of how the object was generated with the
- object itself.
+ A GPU object attribute glues together a GPU target, an object format, a
+ binary string containing the object, and an optional properties dictionary,
+ encapsulating how the object was generated together with the object itself.
- The target attribute must implement the `TargetAttrInterface` interface.
+ There are four object formats:
+ 1. `Offload`: represents generic objects not described by the other three
+ formats, and its meaning is target-dependent. For example, on the NVPTX and
+ AMDGPU targets, this format is associated with LLVM bitcode.
+ 2. `Assembly`: represents GPU assembly code. For example, in the NVPTX
+ target, assembly is PTX code, which can be JITted at runtime.
+ 3. `Binary`: represents executable code for a single GPU architecture. For
+ example, PTX compiled to a CUBIN for a specific compute capability. Note
+ that loading this format is likely to fail if there is a mismatch between
+ the compiled and the running architecture.
+ 4. `Fatbin`: represents a GPU fat binary with executable code for multiple
+ architectures. This format is the default; thus, it gets elided in the
+ attribute's assembly format.
+
+ Object properties are specified through the `properties` dictionary
+ attribute and can be used to define additional information.
+ The target attribute must implement or promise the `TargetAttrInterface`
+ interface.
```
- #gpu.object<#nvvm.target, "...">
+ #gpu.object<#rocdl.target, offload = "..."> // An offload object.
+ #gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "..."> // An assembly object with additional properties.
+ #gpu.object<#rocdl.target, bin = "..."> // A binary object.
+ #gpu.object<#nvvm.target, "..."> // A fatbin object.
```
}];
let parameters = (ins
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index ee7daed58f98314..6d7cb5ca7a7f81f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -47,7 +47,7 @@ class TargetOptions {
/// Constructor initializing the toolkit path, the list of files to link to,
/// extra command line options, the compilation target and a callback for
/// obtaining the parent symbol table. The default compilation target is
- /// `binOrFatbin`.
+ /// `Fatbin`.
TargetOptions(
StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
StringRef cmdOptions = {},
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index ac3c2fde8a4840f..e65b78607df4557 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -145,7 +145,7 @@ struct SparseCompilerOptions
PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
desc("GPU target features")};
/// For NVIDIA GPUs there are 3 compilation format options:
- /// 1. `isa`: the compiler generates PTX and the runtime JITs the PTX.
+ /// 1. `isa`: the compiler generates PTX and the driver JITs the PTX.
/// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
/// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
/// also embeds the PTX in the fat binary.
@@ -156,10 +156,10 @@ struct SparseCompilerOptions
/// program will fail if there's an arch mismatch between `gpuChip` and the
/// GPU running the program.
/// Option 3 is the best compromise between options 1 & 2 as it can JIT in
- /// case of an arch mismatch, however, it's only possible to JIT to a higher
- /// CC than `gpuChip`.
+ /// case of an arch mismatch between `gpuChip` and the running arch. However,
+ /// it's only possible to JIT to a higher CC than `gpuChip`.
PassOptions::Option<std::string> gpuFormat{
- *this, "gpu-format", desc("GPU compilation format"), init("isa")};
+ *this, "gpu-format", desc("GPU compilation format"), init("fatbin")};
/// This option is used to enable GPU library generation.
PassOptions::Option<bool> enableGPULibgen{
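The integration tests below thread this option through the `%format` lit
substitution. Expanded by hand with the new default, a representative
invocation would look like the following (the input file name is a
placeholder):

  mlir-opt --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=fatbin" input.mlir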
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
index 6788ccea3a222c5..3c6ac3ce14f3e23 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests:
config.unsupported = True
+
+config.substitutions.append(("%format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index aa71abbcf0e717c..7d306d81428c733 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80 and cusparselt
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index 062798a39b8106e..59694cb540f5f11 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -1,7 +1,7 @@
//
// NOTE: this test requires gpu-sm80 and cusparselt
//
-// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
// RUN: %s \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
index a310e59d53038fd..64709a2e93bee81 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
@@ -4,7 +4,7 @@
// with RT lib:
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
@@ -14,7 +14,7 @@
// without RT lib:
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index 1e51aae5f389260..324f7a75d3e1e45 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
index ca47de6cca27f6d..039156e046364f6 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -3,7 +3,7 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index 16a240838d7c4fd..9ded223f0c67cbf 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
index c5c3546cdf01694..98fb423001b86f9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
@@ -3,7 +3,7 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index aee8a6a6558e4f5..d73fd5558cbd327 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -6,7 +6,7 @@
// RUN: mlir-opt \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index 934bd837420c1bf..28f5bf063db2dd7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// DEFINE: mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
index d959fdb6a9db178..641ea40681074c6 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
@@ -3,7 +3,7 @@
// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
index 0ec15f2a9c79d70..c494a48b1b55680 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
@@ -13,7 +13,7 @@
// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 4d8a281113593c6..273d6b06792707f 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 664d344b2769bf7..6c5ee42c8473fe3 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -5,7 +5,7 @@
// Similar to the wmma-matmul-f32 but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 4d76eb898dc2935..5183c8d9eb04b00 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,7 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index de68d3b90f11f39..9e14dc6e183f9b7 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -16,7 +16,7 @@
// RUN: -canonicalize -cse \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 6d998522058154f..675de1f4a1cf632 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -10,7 +10,7 @@
// RUN: -convert-func-to-llvm \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
// Basic PTX check to make sure we are generating the right instructions.
@@ -34,7 +34,7 @@
// RUN: -convert-func-to-llvm \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index e66978bc594b1b8..6f5520d98aeebe7 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -16,7 +16,7 @@
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
// Basic PTX check to make sure we are generating the right instructions.
@@ -43,7 +43,7 @@
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \