[Mlir-commits] [mlir] 5e8a116 - Revert "[mlir][gpu] Fallback to JIT compilation" "[mlir][gpu] Increase default SM version from 35 to 50" and "[mlir][gpu] Improving Cubin Serialization with ptxas Compiler"
Mehdi Amini
llvmlistbot at llvm.org
Mon Jul 24 10:23:36 PDT 2023
Author: Mehdi Amini
Date: 2023-07-24T10:23:15-07:00
New Revision: 5e8a1164f22757c19387abf79608e99782b8ec38
URL: https://github.com/llvm/llvm-project/commit/5e8a1164f22757c19387abf79608e99782b8ec38
DIFF: https://github.com/llvm/llvm-project/commit/5e8a1164f22757c19387abf79608e99782b8ec38.diff
LOG: Revert "[mlir][gpu] Fallback to JIT compilation" "[mlir][gpu] Increase default SM version from 35 to 50" and "[mlir][gpu] Improving Cubin Serialization with ptxas Compiler"
This reverts commit 2e0e00ed841951e358a85a871647be9b3a622f51
and reverts commit a6eb40692c795a9cc29266779ceca2e304141114
and reverts commit 585cbe3f639783bf0307b47504acbd205f135310.
15 tests are broken on the mlir-nvidia buildbot:
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_INVALID_SOURCE'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'
Added:
Modified:
mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 38c758c0908fa6..1afbcb2128d490 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,32 +70,6 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
}
namespace gpu {
-
-/// Options for Serialization
-struct SerializationToCubinOptions {
- /// LLVM target triple
- std::string triple;
-
- /// SM Architecture of the GPU
- std::string chip;
-
- /// PTX version that is wanted to produce
- std::string features;
-
- /// Optimization level
- int optLevel = 2;
-
- /// Dump generated PTX to stderr for debug purposes
- bool dumpPtx = false;
-
- /// Compiles generated PTX by ptxas compiler. When it is false, the generated
- /// PTX is compilet by JIT compielr by the driver.
- bool usePtxas = true;
-
- /// Parameters to pass ptxas compiler. It is ignored for JIT compiler.
- std::string ptxasParams;
-};
-
/// Base pass class to serialize kernel functions through LLVM into
/// user-specified IR and add the resulting blob as module attribute.
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
@@ -143,18 +117,9 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
*this, "gpu-binary-annotation",
llvm::cl::desc("Annotation attribute string for GPU binary"),
llvm::cl::init(getDefaultGpuBinaryAnnotation())};
-
Option<bool> dumpPtx{*this, "dump-ptx",
::llvm::cl::desc("Dump generated PTX"),
llvm::cl::init(false)};
-
- Option<bool> usePtxas{
- *this, "use-ptxas",
- ::llvm::cl::desc("Compile generated PTX by ptxas compiler"),
- llvm::cl::init(true)};
- Option<std::string> ptxasParams{
- *this, "ptxas-params",
- ::llvm::cl::desc("Parameters to pass ptxas compiler")};
};
} // namespace gpu
@@ -172,8 +137,11 @@ void registerGpuSerializeToHsacoPass();
/// Create an instance of the GPU kernel function to CUBIN binary serialization
/// pass with optLevel (default level 2).
-std::unique_ptr<Pass>
-createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);
+std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
+ StringRef chip,
+ StringRef features,
+ int optLevel = 2,
+ bool dumpPtx = false);
/// Create an instance of the GPU kernel function to HSAco binary serialization
/// pass.
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
index b61b8a9a6fae88..0077debe26a0b2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,14 +12,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FileUtilities.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/WithColor.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
#include "mlir/Pass/Pass.h"
@@ -43,106 +36,6 @@ static void emitCudaError(const llvm::Twine &expr, const char *buffer,
.concat("]"));
}
-static constexpr char kPtxasCompilerName[] = "ptxas";
-
-/// Compiles the given generated PTX code with the given ptxas compiler.
-static FailureOr<std::string>
-compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
- StringRef ptxSource, bool dumpPtx, std::string *message) {
- // Step 0. Find ptxas compiler
- std::optional<std::string> ptxasCompiler =
- llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
- if (!ptxasCompiler.has_value())
- return failure();
-
- // Step 1. Create temporary files: ptx source file, log file and cubin file
- llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
- llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile);
- llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
- llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
- llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
- std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
- llvm::FileRemover stdinRemover(stdinFile.c_str());
- llvm::FileRemover stdoutRemover(stdoutFile.c_str());
- llvm::FileRemover stderrRemover(stderrFile.c_str());
- llvm::FileRemover binRemover(cubinFile.c_str());
- llvm::FileRemover srcRemover(ptxSourceFile.c_str());
-
- // Step 2. Write the generated PTX into a file, so we can pass it to ptxas
- // compiler
- std::error_code ec;
- llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
- fPtxSource << ptxSource;
- fPtxSource.close();
- if (fPtxSource.has_error()) {
- *message = std::string(
- "Could not write the generated ptx into a temporary file\n");
- return failure();
- }
-
- // Step 3. Build the ptxas command line
- std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
- smCapability, StringRef(ptxSourceFile),
- StringRef("-o"), StringRef(cubinFile)};
-#ifdef _WIN32
- auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
-#else
- auto tokenize = llvm::cl::TokenizeGNUCommandLine;
-#endif // _WIN32
- llvm::BumpPtrAllocator scratchAllocator;
- llvm::StringSaver stringSaver(scratchAllocator);
- SmallVector<const char *> rawArgs;
- tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
- for (const auto *rawArg : rawArgs)
- argVector.emplace_back(rawArg);
-
- std::optional<StringRef> redirects[] = {
- stdinFile.str(),
- stdoutFile.str(),
- stderrFile.str(),
- };
-
- // Step 4. Invoke ptxas
- if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
- llvm::ArrayRef<llvm::StringRef>(argVector),
- /*Env=*/std::nullopt,
- /*Redirects=*/redirects,
- /*SecondsToWait=*/0,
- /*MemoryLimit=*/0,
- /*ErrMsg=*/message)) {
- if (message->empty()) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
- llvm::MemoryBuffer::getFile(stderrFile);
- *message = std::string("Invoking ptxas is failed, see the file: ");
- if (maybeErrorlog)
- *message += maybeErrorlog->get()->getBuffer().str();
- }
- stderrRemover.releaseFile();
- return failure();
- }
-
- // Step 5. The output of ptxas if verbose flag is set. This is useful
- // because it shows local memory usage, register usage, and etc.
- if (dumpPtx) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
- llvm::MemoryBuffer::getFile(stderrFile);
- if (maybeFlog) {
- llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
- }
- }
-
- // Step 6. Read the cubin file, and return. It will eventually be written
- // into executable.
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
- llvm::MemoryBuffer::getFile(cubinFile);
- if (!maybeFcubin) {
- *message = std::string("Could not read cubin file \n");
- return failure();
- }
-
- return std::string(maybeFcubin->get()->getBuffer());
-}
-
#define RETURN_ON_CUDA_ERROR(expr) \
do { \
if (auto status = (expr)) { \
@@ -160,14 +53,12 @@ class SerializeToCubinPass
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToCubinPass)
SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
- StringRef chip = "sm_50", StringRef features = "+ptx60",
- int optLevel = 2, bool dumpPtx = false,
- bool usePtxas = true, StringRef ptxasParams = {});
+ StringRef chip = "sm_35", StringRef features = "+ptx60",
+ int optLevel = 2, bool dumpPtx = false);
StringRef getArgument() const override { return "gpu-to-cubin"; }
StringRef getDescription() const override {
- return "Lower GPU kernel function to CUBIN binary "
- "annotations";
+ return "Lower GPU kernel function to CUBIN binary annotations";
}
private:
@@ -189,10 +80,9 @@ llvm::once_flag SerializeToCubinPass::initializeBackendOnce;
SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
StringRef features, int optLevel,
- bool dumpPtx, bool usePtxas,
- StringRef ptxasParams) {
- // No matter how this pass is constructed, ensure that
- // the NVPTX backend is initialized exactly once.
+ bool dumpPtx) {
+ // No matter how this pass is constructed, ensure that the NVPTX backend
+ // is initialized exactly once.
llvm::call_once(initializeBackendOnce, []() {
// Initialize LLVM NVPTX backend.
LLVMInitializeNVPTXTarget();
@@ -204,9 +94,7 @@ SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
maybeSetOption(this->triple, triple);
maybeSetOption(this->chip, chip);
maybeSetOption(this->features, features);
- maybeSetOption(this->ptxasParams, ptxasParams);
this->dumpPtx = dumpPtx;
- this->usePtxas = usePtxas;
if (this->optLevel.getNumOccurrences() == 0)
this->optLevel.setValue(optLevel);
}
@@ -224,8 +112,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
RETURN_ON_CUDA_ERROR(cuInit(0));
- // Linking requires a device
- // context.
+ // Linking requires a device context.
CUdevice device;
RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
CUcontext context;
@@ -244,23 +131,9 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
auto kernelName = getOperation().getName().str();
if (dumpPtx) {
- llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
- llvm::errs() << isa << "\n";
+ llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
+ llvm::dbgs() << isa << "\n";
}
-
- if (usePtxas) {
- // Try to compile it with ptxas first.
- std::string message;
- FailureOr<std::string> maybeCubinImage =
- compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
- if (succeeded(maybeCubinImage)) {
- return std::make_unique<std::vector<char>>(
- maybeCubinImage.value().begin(), maybeCubinImage.value().end());
- }
- llvm::errs() << message << ". It fallsback to JIT compilation.\n";
- }
-
- // Fallback to JIT compilation if ptxas fails.
RETURN_ON_CUDA_ERROR(cuLinkAddData(
linkState, CUjitInputType::CU_JIT_INPUT_PTX,
const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -277,7 +150,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
auto result =
std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
- // This will also destroy the cubin data.
+ // This will also destroy the cubin data.
RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
@@ -286,22 +159,17 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
// Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
void mlir::registerGpuSerializeToCubinPass() {
- PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
- // Initialize LLVM NVPTX backend.
- LLVMInitializeNVPTXTarget();
- LLVMInitializeNVPTXTargetInfo();
- LLVMInitializeNVPTXTargetMC();
- LLVMInitializeNVPTXAsmPrinter();
-
- return std::make_unique<SerializeToCubinPass>();
- });
+ PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
+ [] { return std::make_unique<SerializeToCubinPass>(); });
}
-std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
- const gpu::SerializationToCubinOptions &options) {
- return std::make_unique<SerializeToCubinPass>(
- options.triple, options.chip, options.features, options.optLevel,
- options.dumpPtx, options.usePtxas, options.ptxasParams);
+std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
+ StringRef arch,
+ StringRef features,
+ int optLevel,
+ bool dumpPtx) {
+ return std::make_unique<SerializeToCubinPass>(triple, arch, features,
+ optLevel, dumpPtx);
}
#else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index bd629b6aa81d07..990f8f7327d809 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -102,12 +102,8 @@ void mlir::sparse_tensor::buildSparseCompiler(
// Finalize GPU code generation.
if (gpuCodegen) {
#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
- gpu::SerializationToCubinOptions cubinOptions;
- cubinOptions.triple = options.gpuTriple;
- cubinOptions.chip = options.gpuChip;
- cubinOptions.features = options.gpuFeatures;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createGpuSerializeToCubinPass(cubinOptions));
+ pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
+ options.gpuTriple, options.gpuChip, options.gpuFeatures));
#endif
pm.addPass(createGpuToLLVMConversionPass());
}
diff --git a/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
index a056ed78ed171a..1c442b0147c8b3 100644
--- a/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestConvertGPUKernelToCubin.cpp
@@ -38,7 +38,7 @@ class TestSerializeToCubinPass
TestSerializeToCubinPass::TestSerializeToCubinPass() {
this->triple = "nvptx64-nvidia-cuda";
- this->chip = "sm_50";
+ this->chip = "sm_35";
this->features = "+ptx60";
}
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index a994a8799afa37..5db6f56fb4b381 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -172,12 +172,8 @@ void buildGpuPassPipeline(OpPassManager &pm,
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
- gpu::SerializationToCubinOptions cubinOptions;
- cubinOptions.triple = options.cubinTriple;
- cubinOptions.chip = options.cubinChip;
- cubinOptions.features = options.cubinFeatures;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createGpuSerializeToCubinPass(cubinOptions));
+ pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
+ options.cubinTriple, options.cubinChip, options.cubinFeatures));
#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
}
More information about the Mlir-commits mailing list