[Mlir-commits] [mlir] 5093413 - [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path (#66220)
llvmlistbot at llvm.org
Thu Sep 14 15:00:32 PDT 2023
Author: Fabian Mora
Date: 2023-09-14T18:00:27-04:00
New Revision: 5093413a5007b017a530edbeed42d32bfd18b126
URL: https://github.com/llvm/llvm-project/commit/5093413a5007b017a530edbeed42d32bfd18b126
DIFF: https://github.com/llvm/llvm-project/commit/5093413a5007b017a530edbeed42d32bfd18b126.diff
LOG: [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path (#66220)
This patch adds an NVPTX compilation path that enables JIT compilation
on NVIDIA targets. The following modifications were performed:
1. Adding a format field to the GPU object attribute, allowing the
translation attribute to use the correct runtime function to load the
module (see the example after this list). Likewise, a dictionary attribute
was added for passing any extra options.
2. Adding the `createObject` method to `GPUTargetAttrInterface`; this
method returns a GPU object from a binary string.
3. Adding the function `mgpuModuleLoadJIT`, which is only available for
NVIDIA GPUs, as there is no equivalent for AMD.
4. Adding the CMake flag `MLIR_GPU_COMPILATION_TEST_FORMAT` to specify
the format to use during testing.
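For reference, here is a sketch of the extended `#gpu.object` syntax,
assembled from the attribute documentation and the tests in the diff below
(object strings elided):
```
// An offload object; for the NVPTX and AMDGPU targets this is LLVM bitcode.
#gpu.object<#rocdl.target, offload = "...">
// An assembly (PTX) object carrying extra properties, here the JIT
// optimization level.
#gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "...">
// A binary object, e.g. a CUBIN for a specific compute capability.
#gpu.object<#rocdl.target, bin = "...">
// A fatbin object; `fatbin` is the default format, so it is elided.
#gpu.object<#nvvm.target, "...">
```
The format is selected when serializing, e.g.
`mlir-opt --gpu-module-to-binary="format=isa"`.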
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
mlir/lib/Target/LLVM/NVVM/Target.cpp
mlir/lib/Target/LLVM/ROCDL/Target.cpp
mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
mlir/test/CMakeLists.txt
mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
mlir/test/Dialect/GPU/ops.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
mlir/test/Integration/GPU/CUDA/async.mlir
mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
mlir/test/Integration/GPU/CUDA/lit.local.cfg
mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
mlir/test/Integration/GPU/CUDA/printf.mlir
mlir/test/Integration/GPU/CUDA/shuffle.mlir
mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
mlir/test/Integration/GPU/CUDA/two-modules.mlir
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
mlir/test/lit.site.cfg.py.in
mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 5255286619e3bf2..160730480394272 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
If serialization fails then the method should return `std::nullopt`.
- The `module` argument must be a GPU Module Op. The `options` argument is
- meant to be used for passing additional options that are not in the
+ The `module` parameter must be a GPU Module Op. The `options` parameter
+ is meant to be used for passing additional options that are not in the
attribute.
}],
"std::optional<SmallVector<char, 0>>", "serializeToObject",
- (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
+ (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+ InterfaceMethod<[{
+ Creates a GPU object attribute from a binary string.
+
+ The `object` parameter is a binary string. The `options` parameter is
+ meant to be used for passing additional options that are not in the
+ attribute.
+ }], "Attribute", "createObject",
+ (ins "const SmallVector<char, 0>&":$object,
+ "const gpu::TargetOptions&":$options)>
];
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 9c1110d8e9a9463..6659f4a2c58e825 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,20 +20,62 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
// GPU object attribute.
//===----------------------------------------------------------------------===//
+// For documentation on these enum cases, see the `GPU_ObjectAttr` docs.
+def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
+def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
+def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
+def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
+def GPU_CompilationTargetEnum : GPU_I32Enum<
+ "CompilationTarget", "GPU compilation format", [
+ GPU_ObjectOffload,
+ GPU_ObjectISA,
+ GPU_ObjectBinary,
+ GPU_ObjectFatbin
+ ]>;
+
def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
let description = [{
- A GPU object attribute pairs a GPU target with a binary string,
- encapsulating the information of how the object was generated with the
- object itself.
+ A GPU object attribute glues together a GPU target, the object format, a
+ binary string with the object, and the object properties, encapsulating how
+ the object was generated together with the object itself.
+
+ There are four object formats:
+ 1. `Offload`: represents generic objects not described by the other three
+ formats, and its meaning is target-dependent. For example, on the NVPTX and
+ AMDGPU targets, this format is associated with LLVM bitcode.
+ 2. `Assembly`: represents GPU assembly code. For example, in the NVPTX
+ target, assembly is PTX code, which can be JITted at runtime.
+ 3. `Binary`: represents executable code for a single GPU architecture. For
+ example, PTX code that was compiled for a specific compute capability. Note
+ that this format is likely to produce an error if there is a mismatch
+ between the compiled and the running architectures.
+ 4. `Fatbin`: represents a GPU fat binary with executable code for multiple
+ architectures. This format is the default; thus, it gets elided in
+ assembly code.
- The target attribute must implement the `TargetAttrInterface` interface.
+ Object properties are specified through the `properties` dictionary
+ attribute and can be used to define additional information.
+ The target attribute must implement or promise the `TargetAttrInterface`
+ interface.
```
- #gpu.object<#nvvm.target, "...">
+ #gpu.object<#rocdl.target, offload = "..."> // An offload object.
+ #gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "..."> // An assembly object with additional properties.
+ #gpu.object<#rocdl.target, bin = "..."> // A binary object.
+ #gpu.object<#nvvm.target, "..."> // A fatbin object.
```
}];
- let parameters = (ins "Attribute":$target, "StringAttr":$object);
- let assemblyFormat = [{`<` $target `,` $object `>`}];
+ let parameters = (ins
+ "Attribute":$target,
+ DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
+ "StringAttr":$object,
+ OptionalParameter<"DictionaryAttr">:$properties
+ );
+ let assemblyFormat = [{ `<`
+ $target `,` (`properties` `=` $properties ^ `,`)?
+ custom<Object>($format, $object)
+ `>`
+ }];
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index a1f64be57fa699d..6d7cb5ca7a7f81f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
class ModuleTranslation;
}
namespace gpu {
+enum class CompilationTarget : uint32_t;
+
/// This class indicates that the attribute associated with this trait is a GPU
/// offloading translation attribute. These kinds of attributes must implement
/// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
/// ensure type safeness. Targets are free to ignore these options.
class TargetOptions {
public:
- /// The target representation of the compilation process.
- typedef enum {
- offload = 1, /// The process should produce an offloading representation.
- /// For the NVVM & ROCDL targets this option produces LLVM IR.
- assembly = 2, /// The process should produce assembly code.
- binary = 4, /// The process should produce a binary.
- fatbinary = 8, /// The process should produce a fat binary.
- binOrFatbin =
- binary |
- fatbinary, /// The process should produce a binary or fatbinary. It's up
- /// to the target to decide which.
- } CompilationTarget;
-
/// Constructor initializing the toolkit path, the list of files to link to,
/// extra command line options, the compilation target and a callback for
/// obtaining the parent symbol table. The default compilation target is
- /// `binOrFatbin`.
- TargetOptions(StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ /// `Fatbin`.
+ TargetOptions(
+ StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
+ StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Returns the typeID.
TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
/// table.
SymbolTable *getSymbolTable() const;
+ /// Returns the default compilation target: `CompilationTarget::Fatbin`.
+ static CompilationTarget getDefaultCompilationTarget();
+
protected:
/// Derived classes must use this constructor to initialize `typeID` to the
/// appropriate value, i.e. `TargetOptions(TypeID::get<DerivedClass>())`.
- TargetOptions(TypeID typeID, StringRef toolkitPath = {},
- ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
- CompilationTarget compilationTarget = binOrFatbin,
- function_ref<SymbolTable *()> getSymbolTableCallback = {});
+ TargetOptions(
+ TypeID typeID, StringRef toolkitPath = {},
+ ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
+ CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+ function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// Path to the target toolkit.
std::string toolkitPath;
@@ -108,7 +102,7 @@ class TargetOptions {
/// process.
std::string cmdOptions;
- /// Compilation process target representation.
+ /// Compilation process target format.
CompilationTarget compilationTarget;
/// Callback for obtaining the parent symbol table of all the GPU modules
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0bfb2750992058f..3de8e18851369df 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
2. `assembly`, `isa`: produces assembly code.
3. `binary`, `bin`: produces binaries.
4. `fatbinary`, `fatbin`: produces fatbinaries.
- 5. `binOrFatbin`: produces bins or fatbins, the target decides which.
}];
let options = [
Option<"offloadingHandler", "handler", "Attribute", "nullptr",
@@ -79,7 +78,7 @@ def GpuModuleToBinaryPass
"Extra files to link to.">,
Option<"cmdOptions", "opts", "std::string", [{""}],
"Command line options to pass to the tools.">,
- Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
+ Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
"The target representation of the compilation process.">
];
}
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 63040b152a860c1..71fcfc84919eb5c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,6 +144,23 @@ struct SparseCompilerOptions
desc("GPU target architecture")};
PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
desc("GPU target features")};
+ /// For NVIDIA GPUs there are 3 compilation format options:
+ /// 1. `isa`: the compiler generates PTX and the driver JITs the PTX.
+ /// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
+ /// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
+ /// also embeds the PTX in the fat binary.
+ /// Notes:
+ /// Option 1 adds a significant runtime performance hit; however, tests are
+ /// more likely to pass with this option.
+ /// Option 2 is better for execution time as there is no JIT; however, the
+ /// program will fail if there's an architecture mismatch between `gpuChip`
+ /// and the GPU running the program.
+ /// Option 3 is the best compromise between options 1 and 2 as it can JIT in
+ /// case of an architecture mismatch between `gpuChip` and the running
+ /// architecture. However, it's only possible to JIT to a higher CC than
+ /// `gpuChip`.
+ PassOptions::Option<std::string> gpuFormat{
+ *this, "gpu-format", desc("GPU compilation format"), init("fatbin")};
/// This option is used to enable GPU library generation.
PassOptions::Option<bool> enableGPULibgen{
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index fde379cd0afe13f..5eb2cadc884e151 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
//===----------------------------------------------------------------------===//
LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
- Attribute target, StringAttr object) {
+ Attribute target, CompilationTarget format,
+ StringAttr object, DictionaryAttr properties) {
if (!target)
return emitError() << "the target attribute cannot be null";
if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
"`gpu::TargetAttrInterface`";
}
+namespace {
+LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
+ StringAttr &object) {
+ std::optional<CompilationTarget> formatResult;
+ StringRef enumKeyword;
+ auto loc = odsParser.getCurrentLocation();
+ if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
+ formatResult = CompilationTarget::Fatbin;
+ if (!formatResult &&
+ (formatResult =
+ gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
+ odsParser.parseEqual())
+ return odsParser.emitError(loc, "expected an equal sign");
+ if (!formatResult)
+ return odsParser.emitError(loc, "expected keyword for GPU object format");
+ FailureOr<StringAttr> objectResult =
+ FieldParser<StringAttr>::parse(odsParser);
+ if (failed(objectResult))
+ return odsParser.emitError(odsParser.getCurrentLocation(),
+ "failed to parse GPU_ObjectAttr parameter "
+ "'object' which is to be a `StringAttr`");
+ format = *formatResult;
+ object = *objectResult;
+ return success();
+}
+
+void printObject(AsmPrinter &odsParser, CompilationTarget format,
+ StringAttr object) {
+ if (format != CompilationTarget::Fatbin)
+ odsParser << stringifyEnum(format) << " = ";
+ odsParser << object;
+}
+} // namespace
+
//===----------------------------------------------------------------------===//
// GPU select object attribute
//===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
}
+CompilationTarget TargetOptions::getCompilationTarget() const {
+ return compilationTarget;
+}
+
+CompilationTarget TargetOptions::getDefaultCompilationTarget() {
+ return CompilationTarget::Fatbin;
+}
+
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
TargetOptions::tokenizeCmdOptions() const {
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@ TargetOptions::tokenizeCmdOptions() const {
return options;
}
-TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
- return compilationTarget;
-}
-
MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)
#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index e29a1f0c3248d04..2bf89f8c57903e5 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(
void GpuModuleToBinaryPass::runOnOperation() {
RewritePatternSet patterns(&getContext());
- int targetFormat = llvm::StringSwitch<int>(compilationTarget)
- .Cases("offloading", "llvm", TargetOptions::offload)
- .Cases("assembly", "isa", TargetOptions::assembly)
- .Cases("binary", "bin", TargetOptions::binary)
- .Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
- .Case("binOrFatbin", TargetOptions::binOrFatbin)
- .Default(-1);
- if (targetFormat == -1)
+ auto targetFormat =
+ llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
+ .Cases("offloading", "llvm", CompilationTarget::Offload)
+ .Cases("assembly", "isa", CompilationTarget::Assembly)
+ .Cases("binary", "bin", CompilationTarget::Binary)
+ .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+ .Default(std::nullopt);
+ if (!targetFormat)
getOperation()->emitError() << "Invalid format specified.";
// Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
return &parentTable.value();
};
- TargetOptions targetOptions(
- toolkitPath, linkFiles, cmdOptions,
- static_cast<TargetOptions::CompilationTarget>(targetFormat),
- lazyTableBuilder);
+ TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
+ lazyTableBuilder);
if (failed(transformGpuModulesToBinaries(
getOperation(),
offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
@@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
assert(target &&
"Target attribute doesn't implements `TargetAttrInterface`.");
- std::optional<SmallVector<char, 0>> object =
+ std::optional<SmallVector<char, 0>> serializedModule =
target.serializeToObject(op, targetOptions);
-
- if (!object) {
+ if (!serializedModule) {
op.emitError("An error happened while serializing the module.");
return failure();
}
- objects.push_back(builder.getAttr<gpu::ObjectAttr>(
- target,
- builder.getStringAttr(StringRef(object->data(), object->size()))));
+ Attribute object = target.createObject(*serializedModule, targetOptions);
+ if (!object) {
+ op.emitError("An error happened while creating the object.");
+ return failure();
+ }
+ objects.push_back(object);
}
builder.setInsertionPointAfter(op);
builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 37f9e09d34c04e7..54069064839eac3 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -84,7 +84,9 @@ void mlir::sparse_tensor::buildSparseCompiler(
nvvmTargetOptions.features = options.gpuFeatures;
pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
pm.addPass(createGpuToLLVMConversionPass());
- pm.addPass(createGpuModuleToBinaryPass());
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.gpuFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
}
pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 7bf6804902479a8..d19d473a5327627 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
return module;
}
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
+ int optLevel) {
+ ScopedContext scopedContext;
+ CUmodule module = nullptr;
+ char jitErrorBuffer[4096] = {0};
+ CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+ CU_JIT_OPTIMIZATION_LEVEL};
+ void *jitOptionsVals[] = {jitErrorBuffer,
+ reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
+ reinterpret_cast<void *>(optLevel)};
+
+ CUresult result =
+ cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
+ if (result) {
+ fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
+ CUDA_REPORT_IF_ERROR(result);
+ }
+ return module;
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..da2ae87fef6715f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
return module;
}
+extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
+ assert(false && "This function is not available in HIP.");
+ return nullptr;
+}
+
extern "C" void mgpuModuleUnload(hipModule_t module) {
HIP_REPORT_IF_ERROR(hipModuleUnload(module));
}
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 13188b1107d928b..7f263627db54fbe 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -47,6 +47,10 @@ class NVVMTargetAttrImpl
std::optional<SmallVector<char, 0>>
serializeToObject(Attribute attribute, Operation *module,
const gpu::TargetOptions &options) const;
+
+ Attribute createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const;
};
} // namespace
@@ -227,9 +231,9 @@ std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
}
// 2. Check PATH.
- if (std::optional<std::string> ptxasCompiler =
+ if (std::optional<std::string> toolPath =
llvm::sys::Process::FindInEnvPath("PATH", tool))
- return *ptxasCompiler;
+ return *toolPath;
// 3. Check `getCUDAToolkitPath()`.
pathRef = getCUDAToolkitPath();
@@ -255,8 +259,7 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
// Determine if the serializer should create a fatbinary with the PTX embedded
// or a simple CUBIN binary.
const bool createFatbin =
- (targetOptions.getCompilationTarget() & gpu::TargetOptions::fatbinary) ==
- gpu::TargetOptions::fatbinary;
+ targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;
// Find the `ptxas` & `fatbinary` tools.
std::optional<std::string> ptxasCompiler = findTool("ptxas");
@@ -522,7 +525,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
llvm::dbgs().flush();
});
#undef DEBUG_TYPE
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
// Emit PTX code.
@@ -541,8 +544,12 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
#undef DEBUG_TYPE
// Return PTX if the compilation target is assembly.
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
- return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
+ if (targetOptions.getCompilationTarget() ==
+ gpu::CompilationTarget::Assembly) {
+ // Make sure to include the null terminator.
+ StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+ return SmallVector<char, 0>(bin.begin(), bin.end());
+ }
// Compile to binary.
#if MLIR_NVPTXCOMPILER_ENABLED == 1
@@ -573,3 +580,20 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
return std::nullopt;
#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
}
+
+Attribute
+NVVMTargetAttrImpl::createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const {
+ auto target = cast<NVVMTargetAttr>(attribute);
+ gpu::CompilationTarget format = options.getCompilationTarget();
+ DictionaryAttr objectProps;
+ Builder builder(attribute.getContext());
+ if (format == gpu::CompilationTarget::Assembly)
+ objectProps = builder.getDictionaryAttr(
+ {builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))});
+ return builder.getAttr<gpu::ObjectAttr>(
+ attribute, format,
+ builder.getStringAttr(StringRef(object.data(), object.size())),
+ objectProps);
+}
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index effb88554e8ee04..611d08fe3e79e56 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -60,6 +60,10 @@ class ROCDLTargetAttrImpl
std::optional<SmallVector<char, 0>>
serializeToObject(Attribute attribute, Operation *module,
const gpu::TargetOptions &options) const;
+
+ Attribute createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const;
};
} // namespace
@@ -417,7 +421,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
<< llvmModule << "\n";
});
#undef DEBUG_TYPE
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
// Translate the Module to ISA.
@@ -434,7 +438,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
});
#undef DEBUG_TYPE
// Return ISA assembly code if the compilation target is assembly.
- if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
// Compile to binary.
@@ -463,3 +467,16 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
return std::nullopt;
#endif // MLIR_ROCM_CONVERSIONS_ENABLED == 1
}
+
+Attribute
+ROCDLTargetAttrImpl::createObject(Attribute attribute,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const {
+ gpu::CompilationTarget format = options.getCompilationTarget();
+ Builder builder(attribute.getContext());
+ return builder.getAttr<gpu::ObjectAttr>(
+ attribute,
+ format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
+ : format,
+ builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
index 3b060ac1779db26..47fe6973778cd7f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -44,6 +44,9 @@ class SelectObjectAttrImpl
Operation *binaryOperation,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) const;
+
+ // Returns the selected object for embedding.
+ gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
};
// Returns an identifier for the global string holding the binary.
std::string getBinaryIdentifier(StringRef binaryName) {
@@ -58,24 +61,15 @@ void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
});
}
-LogicalResult SelectObjectAttrImpl::embedBinary(
- Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation) const {
- assert(operation && "The binary operation must be non null.");
- if (!operation)
- return failure();
-
- auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
- if (!op) {
- operation->emitError("Operation must be a GPU binary.");
- return failure();
- }
-
+gpu::ObjectAttr
+SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
// Obtain the index of the object to select.
int64_t index = -1;
- if (Attribute target = cast<gpu::SelectObjectAttr>(attribute).getTarget()) {
+ if (Attribute target =
+ cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
+ .getTarget()) {
// If the target attribute is a number it is the index. Otherwise compare
// the attribute to every target inside the object array to find the index.
if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
@@ -95,10 +89,28 @@ LogicalResult SelectObjectAttrImpl::embedBinary(
}
if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
- op->emitError("The requested target object couldn't be found.");
+ op->emitError("the requested target object couldn't be found");
+ return nullptr;
+ }
+ return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+}
+
+LogicalResult SelectObjectAttrImpl::embedBinary(
+ Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ assert(operation && "The binary operation must be non null.");
+ if (!operation)
+ return failure();
+
+ auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
+ if (!op) {
+ operation->emitError("operation must be a GPU binary");
return failure();
}
- auto object = mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+
+ gpu::ObjectAttr object = getSelectedObject(op);
+ if (!object)
+ return failure();
llvm::Module *module = moduleTranslation.getLLVMModule();
@@ -130,6 +142,9 @@ class LaunchKernel {
// Get the module load callee.
FunctionCallee getModuleLoadFn();
+ // Get the module load JIT callee.
+ FunctionCallee getModuleLoadJITFn();
+
// Get the module unload callee.
FunctionCallee getModuleUnloadFn();
@@ -149,7 +164,8 @@ class LaunchKernel {
Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
// Create the full kernel launch.
- mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op);
+ mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+ mlir::gpu::ObjectAttr object);
private:
Module &module;
@@ -174,13 +190,22 @@ LogicalResult SelectObjectAttrImpl::launchKernel(
auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
if (!launchFuncOp) {
- launchFuncOperation->emitError("Operation must be a GPU launch func Op.");
+ launchFuncOperation->emitError("operation must be a GPU launch func Op.");
return failure();
}
+ auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
+ if (!binOp) {
+ binaryOperation->emitError("operation must be a GPU binary.");
+ return failure();
+ }
+ gpu::ObjectAttr object = getSelectedObject(binOp);
+ if (!object)
+ return failure();
+
return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
moduleTranslation)
- .createKernelLaunch(launchFuncOp);
+ .createKernelLaunch(launchFuncOp, object);
}
llvm::LaunchKernel::LaunchKernel(
@@ -215,6 +240,12 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy}), false));
}
+llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
+ return module.getOrInsertFunction(
+ "mgpuModuleLoadJIT",
+ FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i32Ty}), false));
+}
+
llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
return module.getOrInsertFunction(
"mgpuModuleUnload",
@@ -299,7 +330,8 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
// call %streamDestroy(%4)
// call %moduleUnload(%1)
mlir::LogicalResult
-llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
+llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+ mlir::gpu::ObjectAttr object) {
auto llvmValue = [&](mlir::Value value) -> Value * {
Value *v = moduleTranslation.lookupValue(value);
assert(v && "Value has not been translated.");
@@ -326,13 +358,29 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
// Create the argument array.
Value *argArray = createKernelArgArray(op);
+ // Default JIT optimization level.
+ llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
+ // Check if there's an optimization level embedded in the object.
+ DictionaryAttr objectProps = object.getProperties();
+ mlir::Attribute optAttr;
+ if (objectProps && (optAttr = objectProps.get("O"))) {
+ auto optLevel = dyn_cast<IntegerAttr>(optAttr);
+ if (!optLevel)
+ return op.emitError("the optimization level must be an integer");
+ optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
+ }
+
// Load the kernel module.
StringRef moduleName = op.getKernelModuleName().getValue();
std::string binaryIdentifier = getBinaryIdentifier(moduleName);
Value *binary = module.getGlobalVariable(binaryIdentifier, true);
if (!binary)
return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
- Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary});
+
+ Value *moduleObject =
+ object.getFormat() == gpu::CompilationTarget::Assembly
+ ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
+ : builder.CreateCall(getModuleLoadFn(), {binary});
// Load the kernel function.
Value *moduleFunction = builder.CreateCall(
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..bf143d036c2f66f 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -26,6 +26,8 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS)
"If arch-specific Arm integration tests run emulated, use this Arm native lli.")
set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING
"If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.")
+ set(MLIR_GPU_COMPILATION_TEST_FORMAT "fatbin" CACHE STRING
+ "The GPU compilation format used by the tests.")
option(MLIR_RUN_AMX_TESTS "Run AMX tests.")
option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.")
option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")
diff --git a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
index 555b28a8293ee4f..22d7caa38feec97 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
@@ -1,10 +1,10 @@
// REQUIRES: host-supports-nvptx
// RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
module attributes {gpu.container_module} {
// CHECK-LABEL:gpu.binary @kernel_module1
- // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, "{{.*}}">]
+ // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, offload = "{{.*}}">]
gpu.module @kernel_module1 [#nvvm.target<chip = "sm_70">] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
}
// CHECK-LABEL:gpu.binary @kernel_module2
- // CHECK:[#gpu.object<#nvvm.target<flags = {fast}>, "{{.*}}">, #gpu.object<#nvvm.target, "{{.*}}">]
+ // CHECK-ISA:[#gpu.object<#nvvm.target<flags = {fast}>, properties = {O = 2 : i32}, assembly = "{{.*}}">, #gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "{{.*}}">]
gpu.module @kernel_module2 [#nvvm.target<flags = {fast}>, #nvvm.target] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
index fb7cfb70c17ed3b..9f987c71387f4ca 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
@@ -1,10 +1,10 @@
// REQUIRES: host-supports-amdgpu
// RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
module attributes {gpu.container_module} {
// CHECK-LABEL:gpu.binary @kernel_module1
- // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, "{{.*}}">]
+ // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, offload = "{{.*}}">]
gpu.module @kernel_module1 [#rocdl.target<chip = "gfx90a">] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
}
// CHECK-LABEL:gpu.binary @kernel_module2
- // CHECK:[#gpu.object<#rocdl.target<flags = {fast}>, "{{.*}}">, #gpu.object<#rocdl.target, "{{.*}}">]
+ // CHECK-ISA:[#gpu.object<#rocdl.target<flags = {fast}>, assembly = "{{.*}}">, #gpu.object<#rocdl.target, assembly = "{{.*}}">]
gpu.module @kernel_module2 [#rocdl.target<flags = {fast}>, #rocdl.target] {
llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
%arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b314a768a089632..0d2f52e8adbfcda 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -127,6 +127,16 @@ module attributes {gpu.container_module} {
gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]
+ gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
+ #gpu.object<#nvvm.target, assembly = "">,
+ #gpu.object<#nvvm.target, offload = "">,
+ #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
+ ]
+
+ // Check that fatbin gets elided as it's the default format.
+ // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
+ gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]
+
func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)
func.func @foo() {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
index 6788ccea3a222c5..19f12d39c8428ad 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests:
config.unsupported = True
+
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index aa71abbcf0e717c..67c8ce8dfa3004f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80 and cusparselt
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index 062798a39b8106e..8917ab1e5a70d71 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -1,7 +1,7 @@
//
// NOTE: this test requires gpu-sm80 and cusparselt
//
-// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
// RUN: %s \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
index 8f67553592fbac0..28af7a03e805ec8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
@@ -4,7 +4,7 @@
// with RT lib:
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
@@ -14,7 +14,7 @@
// without RT lib:
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index 4b321f32d04c0c5..bb7efc8c3c2aec9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
index 0658ff1ff41c0ed..2031c0ae75964ff 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index 947c7d9cbbc3cb7..409eb9697297136 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
// DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
index 4266e51658073f9..958ad5c124ca8df 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 80972f244ec02d7..73db1b825319a7c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -4,7 +4,7 @@
// RUN: mlir-opt \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index dd4b0bbd8e59acb..61de57564beda2e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -2,7 +2,7 @@
// NOTE: this test requires gpu-sm80
//
// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// DEFINE: mlir-cpu-runner \
// DEFINE: --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
index 56d1e6d2973562b..4aa8ce326cc892e 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
index 357ab8ec4d75921..21f3a76c82af1f3 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
@@ -11,7 +11,7 @@
// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 591bf1b4fd18231..95068974a1a07be 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 51bd23f817b33f1..9ab0e59a291e076 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,7 +3,7 @@
// Similar to the wmma-matmul-f32 but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 0307b3d504be9f6..41f4c1d35454d6f 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index b131b8682ddee06..13a05a2766e5df3 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -8,7 +8,7 @@
// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 155423db7e05049..5cf4f1bf8a61487 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index e5047b6efa3bf25..7b10a570e1f10cd 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 163e9fdba60c1a9..070679689240c1b 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 381db2639c371f3..107e8a407d00cf3 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 23c6c117e67f36b..4aa44b9ce5e9673 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 3c5a100b5b90d57..717dc542cc594b7 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index d2a5127a34c3bdd..d96a63f26264e33 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%gpu_compilation_format" \
// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
// RUN: | mlir-cpu-runner \
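async.mlir assembles its pipeline by hand rather than going through
`-test-lower-to-nvvm`, so here the format is handed directly to the
`gpu-module-to-binary` pass. A sketch of the expanded RUN line, again assuming
a configured value of `fatbin`:

// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=fatbin" \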
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index a5d04f7322b4914..605a717b83f3f33 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
index acb8dd43f50b4cf..5f1e33e87df9cb9 100644
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
if not config.enable_cuda_runner:
config.unsupported = True
+
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
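For context: lit substitutions are plain textual (pattern, replacement) pairs
applied to every RUN line before it executes. A minimal sketch of what this new
entry does, assuming `config.gpu_compilation_format` was configured to "fatbin":

# In a test file:
#   ... -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" ...
# After lit applies the substitution:
#   ... -test-lower-to-nvvm="cubin-format=fatbin" ...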
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 7657bf4732d32b7..3635caac43555a6 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index 1a35d1e78b09475..01c5939b251649d 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 40fcea857d5b4eb..2a7482f9cece156 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index de68d3b90f11f39..6e32eb147d499ef 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -16,7 +16,7 @@
// RUN: -canonicalize -cse \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 6d998522058154f..760ded16556ff8f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -10,7 +10,7 @@
// RUN: -convert-func-to-llvm \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
// Basic PTX check to make sure we are generating the right instructions.
@@ -34,7 +34,7 @@
// RUN: -convert-func-to-llvm \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index 882c63a866eb4f3..9f1b2ce005a2869 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -14,7 +14,7 @@
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
// Basic PTX check to make sure we are generating the right instructions.
@@ -41,7 +41,7 @@
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 5a9acdf3d8da6ba..f16dcd9a72272e3 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 48dce309c23bfe7..328c803c39efb73 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -70,6 +70,10 @@ struct TestLowerToNVVMOptions
*this, "cubin-features",
llvm::cl::desc("Features to use to serialize to cubin."),
llvm::cl::init("+ptx60")};
+ PassOptions::Option<std::string> cubinFormat{
+ *this, "cubin-format",
+ llvm::cl::desc("Compilation format to use to serialize to cubin."),
+ llvm::cl::init("isa")};
};
//===----------------------------------------------------------------------===//
@@ -257,7 +261,9 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
// Serialize all GPU modules to binaries.
- pm.addPass(createGpuModuleToBinaryPass());
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
// Convert vector to LLVM (always needed).
// TODO: C++20 designated initializers.
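With this change the test pipeline threads its new `cubin-format` option
through to the module-to-binary pass, mirroring the option wiring in the hunk
above. A minimal command-line usage sketch (`fatbin` is an assumed example
value; `isa` is the default per the option declaration above):

  mlir-opt -test-lower-to-nvvm="cubin-format=fatbin" input.mlir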
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..2de40ba5e8e57e6 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -29,6 +29,7 @@ config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
+config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
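This is where the CMake flag from the commit description lands:
`MLIR_GPU_COMPILATION_TEST_FORMAT` is baked into the generated lit site config
and surfaced to tests through the `%gpu_compilation_format` substitution shown
earlier. A sketch of selecting a format at configure time (`fatbin` is an
assumed example value):

  cmake -DMLIR_ENABLE_CUDA_RUNNER=ON -DMLIR_GPU_COMPILATION_TEST_FORMAT=fatbin ...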
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 62c9b527e1e38cb..a00ebba7b97e6d2 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -79,7 +79,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMMToLLVM)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -115,7 +115,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -145,7 +145,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
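These unit-test updates reflect the move of the compilation-target constants
from `gpu::TargetOptions` to the standalone `gpu::CompilationTarget` enum. A
minimal sketch using only the enumerators that appear in these hunks:

  // Select the serialization stage via gpu::CompilationTarget.
  gpu::TargetOptions offload("", {}, "", gpu::CompilationTarget::Offload);   // LLVM offload object
  gpu::TargetOptions assembly("", {}, "", gpu::CompilationTarget::Assembly); // ISA text (PTX here)
  gpu::TargetOptions binary("", {}, "", gpu::CompilationTarget::Binary);     // device binary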
diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
index 89cf5c5d2ada586..9ada2dab40ff79e 100644
--- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
@@ -83,7 +83,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLMToLLVM)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -119,7 +119,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToPTX)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);
@@ -149,7 +149,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToBinary)) {
// Serialize the module.
auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
ASSERT_TRUE(!!serializer);
- gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+ gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
std::optional<SmallVector<char, 0>> object =
serializer.serializeToObject(gpuModule, options);