[Mlir-commits] [mlir] 5e8a116 - Revert "[mlir][gpu] Fallback to JIT compilation" "[mlir][gpu] Increase default SM version from 35 to 50" and "[mlir][gpu] Improving Cubin Serialization with ptxas Compiler"

Mon Jul 24 10:23:36 PDT 2023

Author: Mehdi Amini
Date: 2023-07-24T10:23:15-07:00
New Revision: 5e8a1164f22757c19387abf79608e99782b8ec38

URL: https://github.com/llvm/llvm-project/commit/5e8a1164f22757c19387abf79608e99782b8ec38
DIFF: https://github.com/llvm/llvm-project/commit/5e8a1164f22757c19387abf79608e99782b8ec38.diff

LOG: Revert "[mlir][gpu] Fallback to JIT compilation" "[mlir][gpu] Increase default SM version from 35 to 50" and "[mlir][gpu] Improving Cubin Serialization with ptxas Compiler"

This reverts commit 2e0e00ed841951e358a85a871647be9b3a622f51
and reverts commit a6eb40692c795a9cc29266779ceca2e304141114
and reverts commit 585cbe3f639783bf0307b47504acbd205f135310.

15 tests are broken on the mlir-nvidia buildbot:

'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_INVALID_SOURCE'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
    mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
    mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
    mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
    mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 38c758c0908fa6..1afbcb2128d490 100644

--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,32 +70,6 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
 }
 
 namespace gpu {
-
-/// Options for Serialization
-struct SerializationToCubinOptions {
-  /// LLVM target triple
-  std::string triple;
-
-  /// SM Architecture of the GPU
-  std::string chip;
-
-  /// PTX version that is wanted to produce
-  std::string features;
-
-  /// Optimization level
-  int optLevel = 2;
-
-  /// Dump generated PTX to stderr for debug purposes
-  bool dumpPtx = false;
-
-  /// Compiles generated PTX by ptxas compiler. When it is false, the generated
-  /// PTX is compilet by JIT compielr by the driver.
-  bool usePtxas = true;
-
-  /// Parameters to pass ptxas compiler. It is ignored for JIT compiler.
-  std::string ptxasParams;
-};
-
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
@@ -143,18 +117,9 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
-
   Option<bool> dumpPtx{*this, "dump-ptx",
                        ::llvm::cl::desc("Dump generated PTX"),
                        llvm::cl::init(false)};
-
-  Option<bool> usePtxas{
-      *this, "use-ptxas",
-      ::llvm::cl::desc("Compile generated PTX by ptxas compiler"),
-      llvm::cl::init(true)};
-  Option<std::string> ptxasParams{
-      *this, "ptxas-params",
-      ::llvm::cl::desc("Parameters to pass ptxas compiler")};
 };
 } // namespace gpu
 
@@ -172,8 +137,11 @@ void registerGpuSerializeToHsacoPass();
 
 /// Create an instance of the GPU kernel function to CUBIN binary serialization
 /// pass with optLevel (default level 2).
-std::unique_ptr<Pass>
-createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);
+std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
+                                                    StringRef chip,
+                                                    StringRef features,
+                                                    int optLevel = 2,
+                                                    bool dumpPtx = false);
 
 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.

diff  --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
index b61b8a9a6fae88..0077debe26a0b2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,14 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FileUtilities.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/WithColor.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"
@@ -43,106 +36,6 @@ static void emitCudaError(const llvm::Twine &expr, const char *buffer,
                      .concat("]"));
 }
 
-static constexpr char kPtxasCompilerName[] = "ptxas";
-
-/// Compiles the given generated PTX code with the given ptxas compiler.
-static FailureOr<std::string>
-compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
-                 StringRef ptxSource, bool dumpPtx, std::string *message) {
-  // Step 0. Find ptxas compiler
-  std::optional<std::string> ptxasCompiler =
-      llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
-  if (!ptxasCompiler.has_value())
-    return failure();
-
-  // Step 1. Create temporary files: ptx source file, log file and cubin file
-  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
-  llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile);
-  llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
-  llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
-  llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
-  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
-  llvm::FileRemover stdinRemover(stdinFile.c_str());
-  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
-  llvm::FileRemover stderrRemover(stderrFile.c_str());
-  llvm::FileRemover binRemover(cubinFile.c_str());
-  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
-
-  // Step 2. Write the generated PTX into a file, so we can pass it  to ptxas
-  // compiler
-  std::error_code ec;
-  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
-  fPtxSource << ptxSource;
-  fPtxSource.close();
-  if (fPtxSource.has_error()) {
-    *message = std::string(
-        "Could not write the generated ptx into a temporary file\n");
-    return failure();
-  }
-
-  // Step 3. Build the ptxas command  line
-  std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
-                                   smCapability,       StringRef(ptxSourceFile),
-                                   StringRef("-o"),    StringRef(cubinFile)};
-#ifdef _WIN32
-  auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
-#else
-  auto tokenize = llvm::cl::TokenizeGNUCommandLine;
-#endif // _WIN32
-  llvm::BumpPtrAllocator scratchAllocator;
-  llvm::StringSaver stringSaver(scratchAllocator);
-  SmallVector<const char *> rawArgs;
-  tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
-  for (const auto *rawArg : rawArgs)
-    argVector.emplace_back(rawArg);
-
-  std::optional<StringRef> redirects[] = {
-      stdinFile.str(),
-      stdoutFile.str(),
-      stderrFile.str(),
-  };
-
-  // Step 4. Invoke ptxas
-  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
-                                llvm::ArrayRef<llvm::StringRef>(argVector),
-                                /*Env=*/std::nullopt,
-                                /*Redirects=*/redirects,
-                                /*SecondsToWait=*/0,
-                                /*MemoryLimit=*/0,
-                                /*ErrMsg=*/message)) {
-    if (message->empty()) {
-      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
-          llvm::MemoryBuffer::getFile(stderrFile);
-      *message = std::string("Invoking ptxas is failed, see the file: ");
-      if (maybeErrorlog)
-        *message += maybeErrorlog->get()->getBuffer().str();
-    }
-    stderrRemover.releaseFile();
-    return failure();
-  }
-
-  // Step 5. The output of ptxas if  verbose flag is set. This is useful
-  // because it shows local memory usage, register usage, and etc.
-  if (dumpPtx) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
-        llvm::MemoryBuffer::getFile(stderrFile);
-    if (maybeFlog) {
-      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
-    }
-  }
-
-  // Step 6. Read the cubin file, and return. It will eventually be written
-  // into executable.
-  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
-      llvm::MemoryBuffer::getFile(cubinFile);
-  if (!maybeFcubin) {
-    *message = std::string("Could not read cubin file \n");
-    return failure();
-  }
-
-  return std::string(maybeFcubin->get()->getBuffer());
-}
-
 #define RETURN_ON_CUDA_ERROR(expr)                                             \
   do {                                                                         \
     if (auto status = (expr)) {                                                \
@@ -160,14 +53,12 @@ class SerializeToCubinPass
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToCubinPass)
 
   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
-                       StringRef chip = "sm_50", StringRef features = "+ptx60",
-                       int optLevel = 2, bool dumpPtx = false,
-                       bool usePtxas = true, StringRef ptxasParams = {});
+                       StringRef chip = "sm_35", StringRef features = "+ptx60",
+                       int optLevel = 2, bool dumpPtx = false);
 
   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {
-    return "Lower GPU kernel function to CUBIN binary "
-           "annotations";
+    return "Lower GPU kernel function to CUBIN binary annotations";
   }
 
 private:
@@ -189,10 +80,9 @@ llvm::once_flag SerializeToCubinPass::initializeBackendOnce;
 
 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
                                            StringRef features, int optLevel,
-                                           bool dumpPtx, bool usePtxas,
-                                           StringRef ptxasParams) {
-  // No matter how this pass is constructed, ensure that
-  // the NVPTX backend is initialized exactly once.
+                                           bool dumpPtx) {
+  // No matter how this pass is constructed, ensure that the NVPTX backend
+  // is initialized exactly once.
   llvm::call_once(initializeBackendOnce, []() {
     // Initialize LLVM NVPTX backend.
     LLVMInitializeNVPTXTarget();
@@ -204,9 +94,7 @@ SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
-  maybeSetOption(this->ptxasParams, ptxasParams);
   this->dumpPtx = dumpPtx;
-  this->usePtxas = usePtxas;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }
@@ -224,8 +112,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
   RETURN_ON_CUDA_ERROR(cuInit(0));
 
-  // Linking requires a device
-  // context.
+  // Linking requires a device context.
   CUdevice device;
   RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
   CUcontext context;
@@ -244,23 +131,9 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
   auto kernelName = getOperation().getName().str();
   if (dumpPtx) {
-    llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
-    llvm::errs() << isa << "\n";
+    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
+    llvm::dbgs() << isa << "\n";
   }
-
-  if (usePtxas) {
-    // Try to compile it with ptxas first.
-    std::string message;
-    FailureOr<std::string> maybeCubinImage =
-        compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
-    if (succeeded(maybeCubinImage)) {
-      return std::make_unique<std::vector<char>>(
-          maybeCubinImage.value().begin(), maybeCubinImage.value().end());
-    }
-    llvm::errs() << message << ". It fallsback to JIT compilation.\n";
-  }
-
-  // Fallback to JIT compilation if ptxas fails.
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -277,7 +150,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
   auto result =
       std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
 
-  // This will also destroy the cubin  data.
+  // This will also destroy the cubin data.
   RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
   RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
 
@@ -286,22 +159,17 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
 // Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
 void mlir::registerGpuSerializeToCubinPass() {
-  PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
-    // Initialize LLVM NVPTX backend.
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-
-    return std::make_unique<SerializeToCubinPass>();
-  });
+  PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
+      [] { return std::make_unique<SerializeToCubinPass>(); });
 }
 
-std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
-    const gpu::SerializationToCubinOptions &options) {
-  return std::make_unique<SerializeToCubinPass>(
-      options.triple, options.chip, options.features, options.optLevel,
-      options.dumpPtx, options.usePtxas, options.ptxasParams);
+std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
+                                                          StringRef arch,
+                                                          StringRef features,
+                                                          int optLevel,
+                                                          bool dumpPtx) {
+  return std::make_unique<SerializeToCubinPass>(triple, arch, features,
+                                                optLevel, dumpPtx);
 }
 
 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE

diff  --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index bd629b6aa81d07..990f8f7327d809 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -102,12 +102,8 @@ void mlir::sparse_tensor::buildSparseCompiler(
   // Finalize GPU code generation.
   if (gpuCodegen) {
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-    gpu::SerializationToCubinOptions cubinOptions;
-    cubinOptions.triple = options.gpuTriple;
-    cubinOptions.chip = options.gpuChip;
-    cubinOptions.features = options.gpuFeatures;
-    pm.addNestedPass<gpu::GPUModuleOp>(
-        createGpuSerializeToCubinPass(cubinOptions));
+    pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
+        options.gpuTriple, options.gpuChip, options.gpuFeatures));
 #endif
     pm.addPass(createGpuToLLVMConversionPass());
   }

diff  --git a/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
index a056ed78ed171a..1c442b0147c8b3 100644
--- a/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
@@ -38,7 +38,7 @@ class TestSerializeToCubinPass
 
 TestSerializeToCubinPass::TestSerializeToCubinPass() {
   this->triple = "nvptx64-nvidia-cuda";
-  this->chip = "sm_50";
+  this->chip = "sm_35";
   this->features = "+ptx60";
 }
 

diff  --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index a994a8799afa37..5db6f56fb4b381 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -172,12 +172,8 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  gpu::SerializationToCubinOptions cubinOptions;
-  cubinOptions.triple = options.cubinTriple;
-  cubinOptions.chip = options.cubinChip;
-  cubinOptions.features = options.cubinFeatures;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createGpuSerializeToCubinPass(cubinOptions));
+  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
+      options.cubinTriple, options.cubinChip, options.cubinFeatures));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }