[Mlir-commits] [mlir] [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path (PR #66220)

Fabian Mora llvmlistbot at llvm.org
Thu Sep 14 12:04:34 PDT 2023


https://github.com/fabianmcg updated https://github.com/llvm/llvm-project/pull/66220:

>From 608af55af6b768efbc2ab4139aea61a8728e50e6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 12 Sep 2023 19:32:43 +0000
Subject: [PATCH 1/4] [mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path

This patch adds an NVPTX compilation path that enables JIT compilation on NVIDIA
targets. The following modifications were performed:
1. Adding a format field to the GPU object attribute, allowing the translation
attribute to pick the correct runtime function for loading the module (see the
example after this list). Likewise, a dictionary attribute was added for
passing any extra options.

2. Adding the "createObject" method to "GPUTargetAttrInterface"; this method
returns a GPU object from a binary string.

3. Adding the function "mgpuModuleLoadJIT", which is only available for NVIDIA GPUs,
as there is no equivalent for AMD.

4. Adding the CMake flag `MLIR_GPU_COMPILATION_TEST_FORMAT` to specify the format
to use during testing.
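
For illustration, here is a sketch of the resulting `gpu.binary` syntax with
the new format and properties fields (the targets and object contents are
placeholders):

```
gpu.binary @kernels [
  #gpu.object<#nvvm.target, offload = "...">,                               // LLVM IR.
  #gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "...">,  // PTX, JITed at runtime.
  #gpu.object<#nvvm.target, bin = "...">,                                   // A CUBIN binary.
  #gpu.object<#nvvm.target, "...">                                          // A fatbin; the default format is elided.
]
```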
---
 .../GPU/IR/CompilationAttrInterfaces.td       | 15 +++-
 .../mlir/Dialect/GPU/IR/CompilationAttrs.td   | 25 +++++-
 .../Dialect/GPU/IR/CompilationInterfaces.h    | 38 ++++----
 .../mlir/Dialect/GPU/Transforms/Passes.td     |  3 +-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 49 ++++++++--
 .../Dialect/GPU/Transforms/ModuleToBinary.cpp | 36 ++++----
 .../ExecutionEngine/CudaRuntimeWrappers.cpp   | 21 +++++
 .../ExecutionEngine/RocmRuntimeWrappers.cpp   |  5 ++
 mlir/lib/Target/LLVM/NVVM/Target.cpp          | 38 ++++++--
 mlir/lib/Target/LLVM/ROCDL/Target.cpp         | 21 ++++-
 .../LLVMIR/Dialect/GPU/SelectObjectAttr.cpp   | 90 ++++++++++++++-----
 mlir/test/CMakeLists.txt                      |  2 +
 .../Dialect/GPU/module-to-binary-nvvm.mlir    |  6 +-
 .../Dialect/GPU/module-to-binary-rocdl.mlir   |  6 +-
 mlir/test/Dialect/GPU/ops.mlir                | 10 +++
 .../Integration/GPU/CUDA/all-reduce-and.mlir  |  2 +-
 .../Integration/GPU/CUDA/all-reduce-max.mlir  |  2 +-
 .../Integration/GPU/CUDA/all-reduce-min.mlir  |  2 +-
 .../Integration/GPU/CUDA/all-reduce-op.mlir   |  2 +-
 .../Integration/GPU/CUDA/all-reduce-or.mlir   |  2 +-
 .../GPU/CUDA/all-reduce-region.mlir           |  2 +-
 .../Integration/GPU/CUDA/all-reduce-xor.mlir  |  2 +-
 mlir/test/Integration/GPU/CUDA/async.mlir     |  2 +-
 .../Integration/GPU/CUDA/gpu-to-cubin.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/lit.local.cfg  |  2 +
 .../GPU/CUDA/multiple-all-reduce.mlir         |  2 +-
 mlir/test/Integration/GPU/CUDA/printf.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/shuffle.mlir   |  2 +-
 .../Integration/GPU/CUDA/two-modules.mlir     |  2 +-
 mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp |  8 +-
 mlir/test/lit.site.cfg.py.in                  |  1 +
 .../Target/LLVM/SerializeNVVMTarget.cpp       |  6 +-
 .../Target/LLVM/SerializeROCDLTarget.cpp      |  6 +-
 33 files changed, 306 insertions(+), 108 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 5255286619e3bf2..160730480394272 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
 
         If serialization fails then the method should return `std::nullopt`.
 
-        The `module` argument must be a GPU Module Op. The `options` argument is
-        meant to be used for passing additional options that are not in the
+        The `module` parameter must be a GPU Module Op. The `options` parameter
+        is meant to be used for passing additional options that are not in the
         attribute.
       }],
       "std::optional<SmallVector<char, 0>>", "serializeToObject",
-      (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
+      (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+    InterfaceMethod<[{
+        Creates a GPU object attribute from a binary string.
+
+        The `object` parameter is a binary string. The `options` parameter is
+        meant to be used for passing additional options that are not in the
+        attribute.
+      }], "Attribute", "createObject",
+        (ins "const SmallVector<char, 0>&":$object,
+             "const gpu::TargetOptions&":$options)>
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 9c1110d8e9a9463..3d2e9848a2b25a0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,6 +20,18 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 // GPU object attribute.
 //===----------------------------------------------------------------------===//
 
+def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
+def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
+def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
+def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
+def GPU_CompilationTargetEnum : GPU_I32Enum<
+  "CompilationTarget", "GPU object format", [
+    GPU_ObjectOffload,
+    GPU_ObjectISA,
+    GPU_ObjectBinary,
+    GPU_ObjectFatbin
+  ]>;
+
 def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
   let description = [{
     A GPU object attribute pairs a GPU target with a binary string,
@@ -32,8 +44,17 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
       #gpu.object<#nvvm.target, "...">
     ```
   }];
-  let parameters = (ins "Attribute":$target, "StringAttr":$object);
-  let assemblyFormat = [{`<` $target `,` $object `>`}];
+  let parameters = (ins
+    "Attribute":$target,
+    DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
+    "StringAttr":$object,
+    OptionalParameter<"DictionaryAttr">:$properties
+  );
+  let assemblyFormat = [{ `<`
+      $target `,`  (`properties` `=` $properties ^ `,`)?
+      custom<Object>($format, $object)
+    `>`
+  }];
   let genVerifyDecl = 1;
 }
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index a1f64be57fa699d..ee7daed58f98314 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
 class ModuleTranslation;
 }
 namespace gpu {
+enum class CompilationTarget : uint32_t;
+
 /// This class indicates that the attribute associated with this trait is a GPU
 /// offloading translation attribute. These kinds of attributes must implement
 /// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
 /// ensure type safety. Targets are free to ignore these options.
 class TargetOptions {
 public:
-  /// The target representation of the compilation process.
-  typedef enum {
-    offload = 1,  /// The process should produce an offloading representation.
-                  /// For the NVVM & ROCDL targets this option produces LLVM IR.
-    assembly = 2, /// The process should produce assembly code.
-    binary = 4,   /// The process should produce a binary.
-    fatbinary = 8, /// The process should produce a fat binary.
-    binOrFatbin =
-        binary |
-        fatbinary, /// The process should produce a binary or fatbinary. It's up
-                   /// to the target to decide which.
-  } CompilationTarget;
-
   /// Constructor initializing the toolkit path, the list of files to link to,
   /// extra command line options, the compilation target and a callback for
   /// obtaining the parent symbol table. The default compilation target is
   /// `binOrFatbin`.
-  TargetOptions(StringRef toolkitPath = {},
-                ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
-                CompilationTarget compilationTarget = binOrFatbin,
-                function_ref<SymbolTable *()> getSymbolTableCallback = {});
+  TargetOptions(
+      StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
+      StringRef cmdOptions = {},
+      CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+      function_ref<SymbolTable *()> getSymbolTableCallback = {});
 
   /// Returns the typeID.
   TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
   /// table.
   SymbolTable *getSymbolTable() const;
 
+  /// Returns the default compilation target: `CompilationTarget::Fatbin`.
+  static CompilationTarget getDefaultCompilationTarget();
+
 protected:
   /// Derived classes must use this constructor to initialize `typeID` to the
   /// appropriate value: i.e. `TargetOptions(TypeID::get<DerivedClass>())`.
-  TargetOptions(TypeID typeID, StringRef toolkitPath = {},
-                ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
-                CompilationTarget compilationTarget = binOrFatbin,
-                function_ref<SymbolTable *()> getSymbolTableCallback = {});
+  TargetOptions(
+      TypeID typeID, StringRef toolkitPath = {},
+      ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
+      CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+      function_ref<SymbolTable *()> getSymbolTableCallback = {});
 
   /// Path to the target toolkit.
   std::string toolkitPath;
@@ -108,7 +102,7 @@ class TargetOptions {
   /// process.
   std::string cmdOptions;
 
-  /// Compilation process target representation.
+  /// Compilation process target format.
   CompilationTarget compilationTarget;
 
   /// Callback for obtaining the parent symbol table of all the GPU modules
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0bfb2750992058f..3de8e18851369df 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
     2. `assembly`, `isa`: produces assembly code.
     3. `binary`, `bin`: produces binaries.
     4. `fatbinary`, `fatbin`: produces fatbinaries.
-    5. `binOrFatbin`: produces bins or fatbins, the target decides which.
   }];
   let options = [
     Option<"offloadingHandler", "handler", "Attribute", "nullptr",
@@ -79,7 +78,7 @@ def GpuModuleToBinaryPass
            "Extra files to link to.">,
     Option<"cmdOptions", "opts", "std::string", [{""}],
            "Command line options to pass to the tools.">,
-    Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
+    Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
            "The target representation of the compilation process.">
   ];
 }
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index fde379cd0afe13f..5eb2cadc884e151 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
 //===----------------------------------------------------------------------===//
 
 LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
-                                 Attribute target, StringAttr object) {
+                                 Attribute target, CompilationTarget format,
+                                 StringAttr object, DictionaryAttr properties) {
   if (!target)
     return emitError() << "the target attribute cannot be null";
   if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
                         "`gpu::TargetAttrInterface`";
 }
 
+namespace {
+LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
+                          StringAttr &object) {
+  std::optional<CompilationTarget> formatResult;
+  StringRef enumKeyword;
+  auto loc = odsParser.getCurrentLocation();
+  if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
+    formatResult = CompilationTarget::Fatbin;
+  if (!formatResult &&
+      (formatResult =
+           gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
+      odsParser.parseEqual())
+    return odsParser.emitError(loc, "expected an equal sign");
+  if (!formatResult)
+    return odsParser.emitError(loc, "expected keyword for GPU object format");
+  FailureOr<StringAttr> objectResult =
+      FieldParser<StringAttr>::parse(odsParser);
+  if (failed(objectResult))
+    return odsParser.emitError(odsParser.getCurrentLocation(),
+                               "failed to parse GPU_ObjectAttr parameter "
+                               "'object' which is to be a `StringAttr`");
+  format = *formatResult;
+  object = *objectResult;
+  return success();
+}
+
+void printObject(AsmPrinter &odsParser, CompilationTarget format,
+                 StringAttr object) {
+  if (format != CompilationTarget::Fatbin)
+    odsParser << stringifyEnum(format) << " = ";
+  odsParser << object;
+}
+} // namespace
+
 //===----------------------------------------------------------------------===//
 // GPU select object attribute
 //===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
   return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
 }
 
+CompilationTarget TargetOptions::getCompilationTarget() const {
+  return compilationTarget;
+}
+
+CompilationTarget TargetOptions::getDefaultCompilationTarget() {
+  return CompilationTarget::Fatbin;
+}
+
 std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
 TargetOptions::tokenizeCmdOptions() const {
   std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@ TargetOptions::tokenizeCmdOptions() const {
   return options;
 }
 
-TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
-  return compilationTarget;
-}
-
 MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)
 
 #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index e29a1f0c3248d04..2bf89f8c57903e5 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(
 
 void GpuModuleToBinaryPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
-  int targetFormat = llvm::StringSwitch<int>(compilationTarget)
-                         .Cases("offloading", "llvm", TargetOptions::offload)
-                         .Cases("assembly", "isa", TargetOptions::assembly)
-                         .Cases("binary", "bin", TargetOptions::binary)
-                         .Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
-                         .Case("binOrFatbin", TargetOptions::binOrFatbin)
-                         .Default(-1);
-  if (targetFormat == -1)
+  auto targetFormat =
+      llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
+          .Cases("offloading", "llvm", CompilationTarget::Offload)
+          .Cases("assembly", "isa", CompilationTarget::Assembly)
+          .Cases("binary", "bin", CompilationTarget::Binary)
+          .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+          .Default(std::nullopt);
+  if (!targetFormat)
     getOperation()->emitError() << "Invalid format specified.";
 
   // Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
     return &parentTable.value();
   };
 
-  TargetOptions targetOptions(
-      toolkitPath, linkFiles, cmdOptions,
-      static_cast<TargetOptions::CompilationTarget>(targetFormat),
-      lazyTableBuilder);
+  TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
+                              lazyTableBuilder);
   if (failed(transformGpuModulesToBinaries(
           getOperation(),
           offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
@@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
     auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
     assert(target &&
            "Target attribute doesn't implements `TargetAttrInterface`.");
-    std::optional<SmallVector<char, 0>> object =
+    std::optional<SmallVector<char, 0>> serializedModule =
         target.serializeToObject(op, targetOptions);
-
-    if (!object) {
+    if (!serializedModule) {
       op.emitError("An error happened while serializing the module.");
       return failure();
     }
 
-    objects.push_back(builder.getAttr<gpu::ObjectAttr>(
-        target,
-        builder.getStringAttr(StringRef(object->data(), object->size()))));
+    Attribute object = target.createObject(*serializedModule, targetOptions);
+    if (!object) {
+      op.emitError("An error happened while creating the object.");
+      return failure();
+    }
+    objects.push_back(object);
   }
   builder.setInsertionPointAfter(op);
   builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 7bf6804902479a8..d19d473a5327627 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
   return module;
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
+                                                                int optLevel) {
+  ScopedContext scopedContext;
+  CUmodule module = nullptr;
+  char jitErrorBuffer[4096] = {0};
+  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+                               CU_JIT_OPTIMIZATION_LEVEL};
+  void *jitOptionsVals[] = {jitErrorBuffer,
+                            reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
+                            reinterpret_cast<void *>(optLevel)};
+
+  CUresult result =
+      cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
+  if (result) {
+    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
+    CUDA_REPORT_IF_ERROR(result);
+  }
+  return module;
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
   CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
 }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..da2ae87fef6715f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
   return module;
 }
 
+extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
+  assert(false && "This function is not available in HIP.");
+  return nullptr;
+}
+
 extern "C" void mgpuModuleUnload(hipModule_t module) {
   HIP_REPORT_IF_ERROR(hipModuleUnload(module));
 }
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 13188b1107d928b..7f263627db54fbe 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -47,6 +47,10 @@ class NVVMTargetAttrImpl
   std::optional<SmallVector<char, 0>>
   serializeToObject(Attribute attribute, Operation *module,
                     const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
 };
 } // namespace
 
@@ -227,9 +231,9 @@ std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
   }
 
   // 2. Check PATH.
-  if (std::optional<std::string> ptxasCompiler =
+  if (std::optional<std::string> toolPath =
           llvm::sys::Process::FindInEnvPath("PATH", tool))
-    return *ptxasCompiler;
+    return *toolPath;
 
   // 3. Check `getCUDAToolkitPath()`.
   pathRef = getCUDAToolkitPath();
@@ -255,8 +259,7 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
   // Determine if the serializer should create a fatbinary with the PTX embedded
   // or a simple CUBIN binary.
   const bool createFatbin =
-      (targetOptions.getCompilationTarget() & gpu::TargetOptions::fatbinary) ==
-      gpu::TargetOptions::fatbinary;
+      targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;
 
   // Find the `ptxas` & `fatbinary` tools.
   std::optional<std::string> ptxasCompiler = findTool("ptxas");
@@ -522,7 +525,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
     llvm::dbgs().flush();
   });
 #undef DEBUG_TYPE
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
     return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
 
   // Emit PTX code.
@@ -541,8 +544,12 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
 #undef DEBUG_TYPE
 
   // Return PTX if the compilation target is assembly.
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
-    return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }
 
     // Compile to binary.
 #if MLIR_NVPTXCOMPILER_ENABLED == 1
@@ -573,3 +580,20 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
   return std::nullopt;
 #endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
 }
+
+Attribute
+NVVMTargetAttrImpl::createObject(Attribute attribute,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  auto target = cast<NVVMTargetAttr>(attribute);
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  DictionaryAttr objectProps;
+  Builder builder(attribute.getContext());
+  if (format == gpu::CompilationTarget::Assembly)
+    objectProps = builder.getDictionaryAttr(
+        {builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))});
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      objectProps);
+}
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index effb88554e8ee04..611d08fe3e79e56 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -60,6 +60,10 @@ class ROCDLTargetAttrImpl
   std::optional<SmallVector<char, 0>>
   serializeToObject(Attribute attribute, Operation *module,
                     const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
 };
 } // namespace
 
@@ -417,7 +421,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
     return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);
 
   // Translate the Module to ISA.
@@ -434,7 +438,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
   });
 #undef DEBUG_TYPE
   // Return ISA assembly code if the compilation target is assembly.
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
 
   // Compile to binary.
@@ -463,3 +467,16 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   return std::nullopt;
 #endif // MLIR_ROCM_CONVERSIONS_ENABLED == 1
 }
+
+Attribute
+ROCDLTargetAttrImpl::createObject(Attribute attribute,
+                                  const SmallVector<char, 0> &object,
+                                  const gpu::TargetOptions &options) const {
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  Builder builder(attribute.getContext());
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute,
+      format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
+                                              : format,
+      builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
index 3b060ac1779db26..47fe6973778cd7f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -44,6 +44,9 @@ class SelectObjectAttrImpl
                              Operation *binaryOperation,
                              llvm::IRBuilderBase &builder,
                              LLVM::ModuleTranslation &moduleTranslation) const;
+
+  // Returns the selected object for embedding.
+  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
 };
 // Returns an identifier for the global string holding the binary.
 std::string getBinaryIdentifier(StringRef binaryName) {
@@ -58,24 +61,15 @@ void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
   });
 }
 
-LogicalResult SelectObjectAttrImpl::embedBinary(
-    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
-    LLVM::ModuleTranslation &moduleTranslation) const {
-  assert(operation && "The binary operation must be non null.");
-  if (!operation)
-    return failure();
-
-  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
-  if (!op) {
-    operation->emitError("Operation must be a GPU binary.");
-    return failure();
-  }
-
+gpu::ObjectAttr
+SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
   ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
 
   // Obtain the index of the object to select.
   int64_t index = -1;
-  if (Attribute target = cast<gpu::SelectObjectAttr>(attribute).getTarget()) {
+  if (Attribute target =
+          cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
+              .getTarget()) {
     // If the target attribute is a number it is the index. Otherwise compare
     // the attribute to every target inside the object array to find the index.
     if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
@@ -95,10 +89,28 @@ LogicalResult SelectObjectAttrImpl::embedBinary(
   }
 
   if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
-    op->emitError("The requested target object couldn't be found.");
+    op->emitError("the requested target object couldn't be found");
+    return nullptr;
+  }
+  return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+}
+
+LogicalResult SelectObjectAttrImpl::embedBinary(
+    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  assert(operation && "The binary operation must be non null.");
+  if (!operation)
+    return failure();
+
+  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
+  if (!op) {
+    operation->emitError("operation must be a GPU binary");
     return failure();
   }
-  auto object = mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
+
+  gpu::ObjectAttr object = getSelectedObject(op);
+  if (!object)
+    return failure();
 
   llvm::Module *module = moduleTranslation.getLLVMModule();
 
@@ -130,6 +142,9 @@ class LaunchKernel {
   // Get the module load callee.
   FunctionCallee getModuleLoadFn();
 
+  // Get the module load JIT callee.
+  FunctionCallee getModuleLoadJITFn();
+
   // Get the module unload callee.
   FunctionCallee getModuleUnloadFn();
 
@@ -149,7 +164,8 @@ class LaunchKernel {
   Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
 
   // Create the full kernel launch.
-  mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op);
+  mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+                                         mlir::gpu::ObjectAttr object);
 
 private:
   Module &module;
@@ -174,13 +190,22 @@ LogicalResult SelectObjectAttrImpl::launchKernel(
 
   auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
   if (!launchFuncOp) {
-    launchFuncOperation->emitError("Operation must be a GPU launch func Op.");
+    launchFuncOperation->emitError("operation must be a GPU launch func op");
     return failure();
   }
 
+  auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
+  if (!binOp) {
+    binaryOperation->emitError("operation must be a GPU binary");
+    return failure();
+  }
+  gpu::ObjectAttr object = getSelectedObject(binOp);
+  if (!object)
+    return failure();
+
   return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
                             moduleTranslation)
-      .createKernelLaunch(launchFuncOp);
+      .createKernelLaunch(launchFuncOp, object);
 }
 
 llvm::LaunchKernel::LaunchKernel(
@@ -215,6 +240,12 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
       FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy}), false));
 }
 
+llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
+  return module.getOrInsertFunction(
+      "mgpuModuleLoadJIT",
+      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i32Ty}), false));
+}
+
 llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
   return module.getOrInsertFunction(
       "mgpuModuleUnload",
@@ -299,7 +330,8 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
 // call %streamDestroy(%4)
 // call %moduleUnload(%1)
 mlir::LogicalResult
-llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
+llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+                                       mlir::gpu::ObjectAttr object) {
   auto llvmValue = [&](mlir::Value value) -> Value * {
     Value *v = moduleTranslation.lookupValue(value);
     assert(v && "Value has not been translated.");
@@ -326,13 +358,29 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) {
   // Create the argument array.
   Value *argArray = createKernelArgArray(op);
 
+  // Default JIT optimization level.
+  llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
+  // Check if there's an optimization level embedded in the object.
+  DictionaryAttr objectProps = object.getProperties();
+  mlir::Attribute optAttr;
+  if (objectProps && (optAttr = objectProps.get("O"))) {
+    auto optLevel = dyn_cast<IntegerAttr>(optAttr);
+    if (!optLevel)
+      return op.emitError("the optimization level must be an integer");
+    optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
+  }
+
   // Load the kernel module.
   StringRef moduleName = op.getKernelModuleName().getValue();
   std::string binaryIdentifier = getBinaryIdentifier(moduleName);
   Value *binary = module.getGlobalVariable(binaryIdentifier, true);
   if (!binary)
     return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
-  Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary});
+
+  Value *moduleObject =
+      object.getFormat() == gpu::CompilationTarget::Assembly
+          ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
+          : builder.CreateCall(getModuleLoadFn(), {binary});
 
   // Load the kernel function.
   Value *moduleFunction = builder.CreateCall(
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..bf143d036c2f66f 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -26,6 +26,8 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS)
       "If arch-specific Arm integration tests run emulated, use this Arm native lli.")
   set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING
       "If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.")
+  set(MLIR_GPU_COMPILATION_TEST_FORMAT "fatbin" CACHE STRING
+      "The GPU compilation format used by the tests.")
   option(MLIR_RUN_AMX_TESTS "Run AMX tests.")
   option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.")
   option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")
diff --git a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
index 555b28a8293ee4f..22d7caa38feec97 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir
@@ -1,10 +1,10 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
 
 module attributes {gpu.container_module} {
   // CHECK-LABEL:gpu.binary @kernel_module1
-  // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, "{{.*}}">]
+  // CHECK:[#gpu.object<#nvvm.target<chip = "sm_70">, offload = "{{.*}}">]
   gpu.module @kernel_module1 [#nvvm.target<chip = "sm_70">] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
         %arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
   }
 
   // CHECK-LABEL:gpu.binary @kernel_module2
-  // CHECK:[#gpu.object<#nvvm.target<flags = {fast}>, "{{.*}}">, #gpu.object<#nvvm.target, "{{.*}}">]
+  // CHECK-ISA:[#gpu.object<#nvvm.target<flags = {fast}>, properties = {O = 2 : i32}, assembly = "{{.*}}">, #gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "{{.*}}">]
   gpu.module @kernel_module2 [#nvvm.target<flags = {fast}>, #nvvm.target] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
         %arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
index fb7cfb70c17ed3b..9f987c71387f4ca 100644
--- a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
+++ b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir
@@ -1,10 +1,10 @@
 // REQUIRES: host-supports-amdgpu
 // RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s
-// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s
+// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA
 
 module attributes {gpu.container_module} {
   // CHECK-LABEL:gpu.binary @kernel_module1
-  // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, "{{.*}}">]
+  // CHECK:[#gpu.object<#rocdl.target<chip = "gfx90a">, offload = "{{.*}}">]
   gpu.module @kernel_module1 [#rocdl.target<chip = "gfx90a">] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
         %arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
   }
 
   // CHECK-LABEL:gpu.binary @kernel_module2
-  // CHECK:[#gpu.object<#rocdl.target<flags = {fast}>, "{{.*}}">, #gpu.object<#rocdl.target, "{{.*}}">]
+  // CHECK-ISA:[#gpu.object<#rocdl.target<flags = {fast}>, assembly = "{{.*}}">, #gpu.object<#rocdl.target, assembly = "{{.*}}">]
   gpu.module @kernel_module2 [#rocdl.target<flags = {fast}>, #rocdl.target] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
         %arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b314a768a089632..0d2f52e8adbfcda 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -127,6 +127,16 @@ module attributes {gpu.container_module} {
 
   gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]
 
+  gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
+                        #gpu.object<#nvvm.target, assembly = "">,
+                        #gpu.object<#nvvm.target, offload = "">,
+                        #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
+                        ]
+
+  // Check that fatbin gets elided as it's the default format.
+  // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
+  gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]
+
   func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)
 
   func.func @foo() {
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index c48a515ed022135..1401ac7bd489cc9 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -10,7 +10,7 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index e8ffc3f830c7c91..2e72ccabd636514 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index fde50e9b6b92fbd..532834197f63d32 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 08c3571ef1c35fa..7d0f56e9a125691 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 134296f39c2b49e..9d00b49c75ff13f 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index c2be1b65950ea51..bf726821cea40a2 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 6b75321b7bfc235..9cfadac04d33d00 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index 1314d32a779a883..f53249d1bb2143f 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -3,7 +3,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%format" \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index abc93f7b1703a66..09211e34d8d14d4 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
index acb8dd43f50b4cf..c49265d67433ede 100644
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
 if not config.enable_cuda_runner:
     config.unsupported = True
+
+config.substitutions.append(("%format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 3389f805ac63d0f..6aced55992417eb 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index eef5ac66ca52ad4..c22c02ccb24500d 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 05cb854d18dd4f3..949631709febfcb 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index fde66de2fce6e7e..7b52db81fc11079 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5d0c420f65d5319..a554fe6dcadb12c 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -70,6 +70,10 @@ struct TestLowerToNVVMOptions
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
       llvm::cl::init("+ptx60")};
+  PassOptions::Option<std::string> cubinFormat{
+      *this, "cubin-format",
+      llvm::cl::desc("Compilation format to use to serialize to cubin."),
+      llvm::cl::init("isa")};
 };
 
 //===----------------------------------------------------------------------===//
@@ -257,7 +261,9 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
 
   // Serialize all GPU modules to binaries.
-  pm.addPass(createGpuModuleToBinaryPass());
+  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
 
   // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..2de40ba5e8e57e6 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -29,6 +29,7 @@ config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
 config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
+config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 62c9b527e1e38cb..a00ebba7b97e6d2 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -79,7 +79,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMMToLLVM)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -115,7 +115,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -145,7 +145,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
index 89cf5c5d2ada586..9ada2dab40ff79e 100644
--- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
@@ -83,7 +83,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLMToLLVM)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -119,7 +119,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToPTX)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -149,7 +149,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToBinary)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);

>From 05afac027db6cbebcdacdf6761c0e0672bfd9b1e Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Wed, 13 Sep 2023 20:43:05 +0000
Subject: [PATCH 2/4] Add a format option to the SparseCompiler pipeline.
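
A sketch, in lit RUN-line form, of how the new option would be used; only
`gpu-format` is introduced by this patch, and the other GPU option names are
assumed from the surrounding pipeline options:

```
// RUN: mlir-opt %s --sparse-compiler="enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-format=fatbin"
```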

---
 .../mlir/Dialect/SparseTensor/Pipelines/Passes.h | 16 ++++++++++++++++
 .../Pipelines/SparseTensorPipelines.cpp          |  4 +++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 5deab8321cbcbde..ac3c2fde8a4840f 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,6 +144,22 @@ struct SparseCompilerOptions
                                            desc("GPU target architecture")};
   PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
                                                desc("GPU target features")};
+  /// For NVIDIA GPUs there are 3 compilation format options:
+  /// 1. `isa`: the compiler generates PTX and the runtime JITs the PTX.
+  /// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
+  /// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
+  /// also embeds the PTX in the fat binary.
+  /// Notes:
+  /// Option 1 adds a significant runtime performance hit; however, tests are
+  /// more likely to pass with this option.
+  /// Option 2 is better for execution time as there is no JIT; however, the
+  /// program will fail if there's an arch mismatch between `gpuChip` and the
+  /// GPU running the program.
+  /// Option 3 is the best compromise between options 1 & 2 as it can JIT in
+  /// case of an arch mismatch; however, it's only possible to JIT to a higher
+  /// CC than `gpuChip`.
+  PassOptions::Option<std::string> gpuFormat{
+      *this, "gpu-format", desc("GPU compilation format"), init("isa")};
 
   /// This option is used to enable GPU library generation.
   PassOptions::Option<bool> enableGPULibgen{
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 37f9e09d34c04e7..54069064839eac3 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -84,7 +84,9 @@ void mlir::sparse_tensor::buildSparseCompiler(
     nvvmTargetOptions.features = options.gpuFeatures;
     pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
     pm.addPass(createGpuToLLVMConversionPass());
-    pm.addPass(createGpuModuleToBinaryPass());
+    GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+    gpuModuleToBinaryPassOptions.compilationTarget = options.gpuFormat;
+    pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
   }
 
   pm.addPass(createReconcileUnrealizedCastsPass());

>From 4ad818f186ab35d0bc344203ab1560de43fda3be Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Thu, 14 Sep 2023 13:05:47 +0000
Subject: [PATCH 3/4] Updated the docs & migrated more tests to use
 MLIR_GPU_COMPILATION_TEST_FORMAT
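
The flag flows from CMake through `lit.site.cfg.py.in` into the `%format` lit
substitution added in patch 1, so configuring with, e.g.,
`-DMLIR_GPU_COMPILATION_TEST_FORMAT=isa` switches every migrated test to the
JIT path. A migrated test then reads (FileCheck invocation omitted):

```
// RUN: mlir-opt %s -test-lower-to-nvvm="cubin-format=%format" \
// RUN: | mlir-cpu-runner \
// RUN:   --shared-libs=%mlir_cuda_runtime \
// RUN:   --shared-libs=%mlir_runner_utils
```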

---
 .../mlir/Dialect/GPU/IR/CompilationAttrs.td   | 33 +++++++++++++++----
 .../Dialect/GPU/IR/CompilationInterfaces.h    |  2 +-
 .../Dialect/SparseTensor/Pipelines/Passes.h   |  8 ++---
 .../SparseTensor/GPU/CUDA/lit.local.cfg       |  2 ++
 .../sparse-matmul-2-4-lib-from-linalg.mlir    |  2 +-
 .../CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir |  2 +-
 .../GPU/CUDA/sparse-gemm-lib.mlir             |  4 +--
 .../GPU/CUDA/sparse-matmul-lib.mlir           |  2 +-
 .../GPU/CUDA/sparse-matvec-const.mlir         |  2 +-
 .../GPU/CUDA/sparse-matvec-lib.mlir           |  2 +-
 .../SparseTensor/GPU/CUDA/sparse-matvec.mlir  |  2 +-
 .../GPU/CUDA/sparse-mma-2-4-f16.mlir          |  2 +-
 .../GPU/CUDA/sparse-sampled-matmul-lib.mlir   |  2 +-
 ...ansform-mma-sync-matmul-f16-f16-accum.mlir |  2 +-
 .../sm80/transform-mma-sync-matmul-f32.mlir   |  2 +-
 .../GPU/CUDA/TensorCore/wmma-matmul-f16.mlir  |  2 +-
 .../TensorCore/wmma-matmul-f32-bare-ptr.mlir  |  2 +-
 .../GPU/CUDA/TensorCore/wmma-matmul-f32.mlir  |  2 +-
 .../sm90/tma_load_128x64_swizzle128b.mlir     |  2 +-
 .../sm90/tma_load_64x8_8x128_noswizzle.mlir   |  4 +--
 ...a_load_64x8_8x128_noswizzle-transform.mlir |  4 +--
 21 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 3d2e9848a2b25a0..cc43c53c269d87a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,12 +20,13 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 // GPU object attribute.
 //===----------------------------------------------------------------------===//
 
+// For documentation on these enum cases, see the `GPU_ObjectAttr` docs.
 def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
 def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
 def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
 def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
 def GPU_CompilationTargetEnum : GPU_I32Enum<
-  "CompilationTarget", "GPU object format", [
+  "CompilationTarget", "GPU compilation format", [
     GPU_ObjectOffload,
     GPU_ObjectISA,
     GPU_ObjectBinary,
@@ -34,14 +35,34 @@ def GPU_CompilationTargetEnum : GPU_I32Enum<
 
 def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
   let description = [{
-    A GPU object attribute pairs a GPU target with a binary string,
-    encapsulating the information of how the object was generated with the
-    object itself.
+    A GPU object attribute glues together a GPU target, the object kind, a
+    binary string with the object, and the object properties, encapsulating
+    how the object was generated along with the object itself.
 
-    The target attribute must implement the `TargetAttrInterface` interface.
+    There are four object formats:
+    1. `Offload`: represents generic objects not described by the other three
+    formats, and its meaning is target-dependent. For example, on the NVPTX and
+    AMDGPU targets, this format is associated with LLVM bitcode.
+    2. `Assembly`: represents GPU assembly code. For example, in the NVPTX
+    target, assembly is PTX code, which can be JITted at runtime.
+    3. `Binary`: represents executable code for a single GPU architecture. For
+    example, PTX code that was compiled for a specific compute capability. Note
+    that this format is likely to throw an error if there is an arch mismatch
+    between the compiled and running arch.
+    4. `Fatbin`: represents a GPU fat binary with executable code for multiple
+    architectures. This format is the default; thus, it gets elided in
+    assembly code.
+
+    Object properties are specified through the `properties` dictionary
+    attribute and can be used to define additional information.
+    The target attribute must implement or promise the `TargetAttrInterface`
+    interface.
 
     ```
-      #gpu.object<#nvvm.target, "...">
+      #gpu.object<#rocdl.target, offload = "..."> // An offload object.
+      #gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "..."> // An assembly object with additional properties.
+      #gpu.object<#rocdl.target, bin = "..."> // A binary object.
+      #gpu.object<#nvvm.target, "..."> // A fatbin object.
     ```
   }];
   let parameters = (ins
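
To make the four formats concrete, here is a hypothetical loader-side dispatch over the `CompilationTarget` enum declared above; the `ObjectAttr::getFormat()` accessor name and the include path are assumptions derived from the attribute's `format` parameter, not verified API.

```
#include "mlir/Dialect/GPU/IR/GPUDialect.h"

// Sketch only: how a consumer might branch on the object format.
void dispatchOnFormat(mlir::gpu::ObjectAttr object) {
  switch (object.getFormat()) {
  case mlir::gpu::CompilationTarget::Offload:
    break; // Target-dependent payload, e.g. LLVM bitcode on NVPTX/AMDGPU.
  case mlir::gpu::CompilationTarget::Assembly:
    break; // GPU assembly, e.g. PTX, which the driver can JIT at load time.
  case mlir::gpu::CompilationTarget::Binary:
    break; // Executable code for exactly one architecture.
  case mlir::gpu::CompilationTarget::Fatbin:
    break; // Executable code for several architectures; the default format.
  }
}
```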
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index ee7daed58f98314..6d7cb5ca7a7f81f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -47,7 +47,7 @@ class TargetOptions {
   /// Constructor initializing the toolkit path, the list of files to link to,
   /// extra command line options, the compilation target and a callback for
   /// obtaining the parent symbol table. The default compilation target is
-  /// `binOrFatbin`.
+  /// `Fatbin`.
   TargetOptions(
       StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
       StringRef cmdOptions = {},
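
As a sanity check of the new default, a short hedged snippet; the `getCompilationTarget()` accessor is an assumption based on the surrounding header, and only the `Fatbin` default itself is documented above.

```
#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
#include <cassert>

// A default-constructed TargetOptions should now report Fatbin, replacing
// the former `binOrFatbin` default. Accessor name is an assumption.
void checkDefaultTarget() {
  mlir::gpu::TargetOptions opts;
  assert(opts.getCompilationTarget() == mlir::gpu::CompilationTarget::Fatbin);
}
```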
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index ac3c2fde8a4840f..e65b78607df4557 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -145,7 +145,7 @@ struct SparseCompilerOptions
   PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
                                                desc("GPU target features")};
   /// For NVIDIA GPUs there are 3 compilation format options:
-  /// 1. `isa`: the compiler generates PTX and the runtime JITs the PTX.
+  /// 1. `isa`: the compiler generates PTX and the driver JITs the PTX.
   /// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
   /// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
   /// also embeds the PTX in the fat binary.
@@ -156,10 +156,10 @@ struct SparseCompilerOptions
   /// program will fail if there's an arch mismatch between `gpuChip` and the
   /// GPU running the program.
   /// Option 3 is the best compromise between options 1 & 2 as it can JIT in
-  /// case of an arch mismatch, however, it's only possible to JIT to a higher
-  /// CC than `gpuChip`.
+  /// case of an arch mismatch between `gpuChip` and the running arch. However,
+  /// it's only possible to JIT to a higher CC than `gpuChip`.
   PassOptions::Option<std::string> gpuFormat{
-      *this, "gpu-format", desc("GPU compilation format"), init("isa")};
+      *this, "gpu-format", desc("GPU compilation format"), init("fatbin")};
 
   /// This option is used to enable GPU library generation.
   PassOptions::Option<bool> enableGPULibgen{
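
Pipeline clients choose among the three formats through this option; a minimal sketch follows, assuming `SparseCompilerOptions` is populated programmatically (the textual `--sparse-compiler="... gpu-format=..."` spelling used in the tests below is the equivalent command-line form).

```
#include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"

// Sketch: pick a GPU compilation format per the trade-offs documented above.
void configureGpuFormat(mlir::sparse_tensor::SparseCompilerOptions &options) {
  options.gpuFormat = "isa";       // PTX; the driver JITs it at load time.
  // options.gpuFormat = "bin";    // CUBIN for exactly `gpuChip`.
  // options.gpuFormat = "fatbin"; // Default: CUBIN plus embedded PTX.
}
```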
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
index 6788ccea3a222c5..3c6ac3ce14f3e23 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
 if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests:
     config.unsupported = True
+
+config.substitutions.append(("%format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index aa71abbcf0e717c..7d306d81428c733 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE: --shared-libs=%mlir_cuda_runtime \
 // DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index 062798a39b8106e..59694cb540f5f11 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -1,7 +1,7 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
 // RUN:          %s \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
index a310e59d53038fd..64709a2e93bee81 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
@@ -4,7 +4,7 @@
 // with RT lib:
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71"  \
+// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format"  \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
@@ -14,7 +14,7 @@
 // without RT lib:
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71"  \
+// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format"  \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index 1e51aae5f389260..324f7a75d3e1e45 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:    --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE:    --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
index ca47de6cca27f6d..039156e046364f6 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -3,7 +3,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index 16a240838d7c4fd..9ded223f0c67cbf 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
index c5c3546cdf01694..98fb423001b86f9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
@@ -3,7 +3,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index aee8a6a6558e4f5..d73fd5558cbd327 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -6,7 +6,7 @@
 // RUN: mlir-opt \
 // RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index 934bd837420c1bf..28f5bf063db2dd7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
 // DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
 // DEFINE:   mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
index d959fdb6a9db178..641ea40681074c6 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
@@ -3,7 +3,7 @@
 // RUN: mlir-opt %s \
 // RUN:  -test-transform-dialect-interpreter \
 // RUN:  -test-transform-dialect-erase-schedule \
-// RUN:  -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN:  -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
index 0ec15f2a9c79d70..c494a48b1b55680 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
@@ -13,7 +13,7 @@
 // RUN: mlir-opt %s \
 // RUN:   -test-transform-dialect-interpreter \
 // RUN:   -test-transform-dialect-erase-schedule \
-// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \
+// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 4d8a281113593c6..273d6b06792707f 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 664d344b2769bf7..6c5ee42c8473fe3 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -5,7 +5,7 @@
 // Similar to the wmma-matmul-f32 test but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 4d76eb898dc2935..5183c8d9eb04b00 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index de68d3b90f11f39..9e14dc6e183f9b7 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -16,7 +16,7 @@
 // RUN:         -canonicalize -cse \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 6d998522058154f..675de1f4a1cf632 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -10,7 +10,7 @@
 // RUN:         -convert-func-to-llvm \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // Basic PTX check to make sure we are generating the right instructions.
@@ -34,7 +34,7 @@
 // RUN:         -convert-func-to-llvm \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index e66978bc594b1b8..6f5520d98aeebe7 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -16,7 +16,7 @@
 // RUN:     -canonicalize \
 // RUN:     -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // Basic PTX check to make sure we are generating the right instructions.
@@ -43,7 +43,7 @@
 // RUN:     -canonicalize \
 // RUN:     -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \

>From a9c9e7f23e72ba1eef7eccfe167d86940a1b8be8 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Thu, 14 Sep 2023 19:01:15 +0000
Subject: [PATCH 4/4] Switched %format -> %gpu_compilation_format, arch ->
 architecture, and & -> and

---
 mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td  |  4 ++--
 .../mlir/Dialect/SparseTensor/Pipelines/Passes.h      | 11 ++++++-----
 .../Dialect/SparseTensor/GPU/CUDA/lit.local.cfg       |  2 +-
 .../sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir    |  2 +-
 .../GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir     |  2 +-
 .../SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir        |  4 ++--
 .../SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir      |  2 +-
 .../SparseTensor/GPU/CUDA/sparse-matvec-const.mlir    |  2 +-
 .../SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir      |  2 +-
 .../Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir  |  2 +-
 .../SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir     |  2 +-
 .../GPU/CUDA/sparse-sampled-matmul-lib.mlir           |  2 +-
 .../sm80/transform-mma-sync-matmul-f16-f16-accum.mlir |  2 +-
 .../sm80/transform-mma-sync-matmul-f32.mlir           |  2 +-
 .../GPU/CUDA/TensorCore/wmma-matmul-f16.mlir          |  2 +-
 .../GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir |  2 +-
 .../GPU/CUDA/TensorCore/wmma-matmul-f32.mlir          |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir     |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir     |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir |  2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/async.mlir             |  2 +-
 mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir      |  2 +-
 mlir/test/Integration/GPU/CUDA/lit.local.cfg          |  2 +-
 .../Integration/GPU/CUDA/multiple-all-reduce.mlir     |  2 +-
 mlir/test/Integration/GPU/CUDA/printf.mlir            |  2 +-
 mlir/test/Integration/GPU/CUDA/shuffle.mlir           |  2 +-
 .../GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir    |  2 +-
 .../GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir  |  4 ++--
 .../tma_load_64x8_8x128_noswizzle-transform.mlir      |  4 ++--
 mlir/test/Integration/GPU/CUDA/two-modules.mlir       |  2 +-
 34 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index cc43c53c269d87a..6659f4a2c58e825 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -47,8 +47,8 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
     target, assembly is PTX code, which can be JITted at runtime.
     3. `Binary`: represents executable code for a single GPU architecture. For
     example, PTX code that was compiled for a specific compute capability. Note
-    that this format is likely to throw an error if there is an arch mismatch
-    between the compiled and running arch.
+    that this format is likely to throw an error if there is an architecture
+    mismatch between the compiled and running architecture.
     4. `Fatbin`: represents a GPU fat binary with executable code for multiple
     architectures. This format is the default; thus, it gets elided in
     assembly code.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index e65b78607df4557..b07ab8b2a60341d 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -153,11 +153,12 @@ struct SparseCompilerOptions
   /// Option 1 adds a significant runtime performance hit, however, tests are
   /// more likely to pass with this option.
   /// Option 2 is better for execution time as there is no JIT; however, the
-  /// program will fail if there's an arch mismatch between `gpuChip` and the
-  /// GPU running the program.
-  /// Option 3 is the best compromise between options 1 & 2 as it can JIT in
-  /// case of an arch mismatch between `gpuChip` and the running arch. However,
-  /// it's only possible to JIT to a higher CC than `gpuChip`.
+  /// program will fail if there's an architecture mismatch between `gpuChip`
+  /// and the GPU running the program.
+  /// Option 3 is the best compromise between options 1 and 2 as it can JIT in
+  /// case of an architecture mismatch between `gpuChip` and the running
+  /// architecture. However, it's only possible to JIT to a higher CC than
+  /// `gpuChip`.
   PassOptions::Option<std::string> gpuFormat{
       *this, "gpu-format", desc("GPU compilation format"), init("fatbin")};
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
index 3c6ac3ce14f3e23..19f12d39c8428ad 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -1,4 +1,4 @@
 if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests:
     config.unsupported = True
 
-config.substitutions.append(("%format", config.gpu_compilation_format))
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index 7d306d81428c733..67c8ce8dfa3004f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
+// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE: --shared-libs=%mlir_cuda_runtime \
 // DEFINE: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index 59694cb540f5f11..8917ab1e5a70d71 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -1,7 +1,7 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
 // RUN:          %s \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
index 64709a2e93bee81..9e29dbcca7ff4b8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
@@ -4,7 +4,7 @@
 // with RT lib:
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format"  \
+// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format"  \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
@@ -14,7 +14,7 @@
 // without RT lib:
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format"  \
+// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format"  \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index 324f7a75d3e1e45..b21576635eddd60 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:    --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
+// DEFINE:    --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
index 039156e046364f6..9f995e2d1349239 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -3,7 +3,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
+// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index 9ded223f0c67cbf..b6dfce577f2a424 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
index 98fb423001b86f9..c6faf2660541a7e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir
@@ -3,7 +3,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format" \
+// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index d73fd5558cbd327..8ee7a266083b0c0 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -6,7 +6,7 @@
 // RUN: mlir-opt \
 // RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%format" \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index 28f5bf063db2dd7..850c1ca069a1a8a 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%format
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
 // DEFINE:   mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
index 641ea40681074c6..21f0e24d5e5da46 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
@@ -3,7 +3,7 @@
 // RUN: mlir-opt %s \
 // RUN:  -test-transform-dialect-interpreter \
 // RUN:  -test-transform-dialect-erase-schedule \
-// RUN:  -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
+// RUN:  -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
index c494a48b1b55680..22b422acf7cb564 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
@@ -13,7 +13,7 @@
 // RUN: mlir-opt %s \
 // RUN:   -test-transform-dialect-interpreter \
 // RUN:   -test-transform-dialect-erase-schedule \
-// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%format" \
+// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 273d6b06792707f..a20da0673d653bb 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 6c5ee42c8473fe3..643f563bcb7b732 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -5,7 +5,7 @@
 // Similar to the wmma-matmul-f32 test but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 5183c8d9eb04b00..4159b5b8e57e9fb 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index 1401ac7bd489cc9..558e19d15e9107a 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -10,7 +10,7 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 2e72ccabd636514..f1ae0be2d9871c1 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index 532834197f63d32..191f64d297a3b91 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 7d0f56e9a125691..089b18e311d53fa 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 9d00b49c75ff13f..686c3931ee3d2de 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index bf726821cea40a2..e8903b0812ef3c7 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 9cfadac04d33d00..cfb22b3d470dfc7 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index f53249d1bb2143f..be26ff6d7ac1828 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -3,7 +3,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%format" \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%gpu_compilation_format" \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index 09211e34d8d14d4..f3587ddfeecc7ff 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
index c49265d67433ede..5f1e33e87df9cb9 100644
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,4 +1,4 @@
 if not config.enable_cuda_runner:
     config.unsupported = True
 
-config.substitutions.append(("%format", config.gpu_compilation_format))
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 6aced55992417eb..3c6135d1d72af86 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index c22c02ccb24500d..52560e9ebb06e83 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 949631709febfcb..16ccf90595c5722 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index 9e14dc6e183f9b7..6e32eb147d499ef 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -16,7 +16,7 @@
 // RUN:         -canonicalize -cse \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 675de1f4a1cf632..760ded16556ff8f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -10,7 +10,7 @@
 // RUN:         -convert-func-to-llvm \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // Basic PTX check to make sure we are generating the right instructions.
@@ -34,7 +34,7 @@
 // RUN:         -convert-func-to-llvm \
 // RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index 6f5520d98aeebe7..da38c160cf2873c 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -16,7 +16,7 @@
 // RUN:     -canonicalize \
 // RUN:     -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // Basic PTX check to make sure we are generating the right instructions.
@@ -43,7 +43,7 @@
 // RUN:     -canonicalize \
 // RUN:     -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 7b52db81fc11079..8cbbe000bfa74b3 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%format" \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \


