[Mlir-commits] [mlir] bd22554 - [MLIR][GPU] Run generic LLVM optimizations when serializing (on AMD)

Fri Nov 19 11:21:35 PST 2021

Author: Krzysztof Drewniak
Date: 2021-11-19T19:21:24Z
New Revision: bd22554af06e1f16dc9ff12eac8987f0ceebe8c1

URL: https://github.com/llvm/llvm-project/commit/bd22554af06e1f16dc9ff12eac8987f0ceebe8c1
DIFF: https://github.com/llvm/llvm-project/commit/bd22554af06e1f16dc9ff12eac8987f0ceebe8c1.diff

LOG: [MLIR][GPU] Run generic LLVM optimizations when serializing (on AMD)

- Adds hooks that allow SerializeTo* passes to arbitrarily transform
the produced LLVM Module before it is passed to the code generation
passes.

- Uses these hooks within the SerializeToHsaco pass in order to run
LLVM optimizations and to set the optimization level on the
TargetMachine.

- Adds an optLevel parameter to SerializeToHsaco.

Future work may include moving much of what's been added to
SerializeToHsaco to SerializeToBlob, but that would require
confirmation from the NVVM backend maintainers that it would be
appropriate to do so.

Depends on D114107

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D114113

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/Passes.h
    mlir/lib/Dialect/GPU/CMakeLists.txt
    mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
    mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
index a207c6b2279e1..0ce874fc4c3b3 100644

--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -54,14 +54,23 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
 protected:
   void getDependentDialects(DialectRegistry &registry) const override;
 
-private:
-  /// Creates the LLVM target machine to generate the ISA.
-  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+  /// Hook allowing the application of optimizations before codegen.
+  /// By default, does nothing.
+  virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                                     llvm::TargetMachine &targetMachine);
 
   /// Translates the 'getOperation()' result to an LLVM module.
   virtual std::unique_ptr<llvm::Module>
   translateToLLVMIR(llvm::LLVMContext &llvmContext);
 
+private:
+  /// Creates the LLVM target machine to generate the ISA.
+  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+
+  /// Translates the module to ISA. Returns llvm::None on failure.
+  Optional<std::string> translateToISA(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine);
+
   /// Serializes the target ISA to binary form.
   virtual std::unique_ptr<std::vector<char>>
   serializeISA(const std::string &isa) = 0;

diff  --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 5fe79070737c4..1243f4e540aba 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -162,6 +162,7 @@ if(MLIR_ENABLE_ROCM_RUNNER)
   target_link_libraries(MLIRGPUOps
     PRIVATE
     lldELF
+    MLIRExecutionEngine
     MLIRROCDLToLLVMIRTranslation
   )
 

diff  --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
index 07c138e380e2e..f8be171d1f5da 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@@ -31,18 +31,28 @@ gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
 gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
     : OperationPass<gpu::GPUModuleOp>(other) {}
 
-static std::string translateToISA(llvm::Module &llvmModule,
-                                  llvm::TargetMachine &targetMachine) {
+Optional<std::string>
+gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule,
+                                         llvm::TargetMachine &targetMachine) {
   llvmModule.setDataLayout(targetMachine.createDataLayout());
 
+  if (failed(optimizeLlvm(llvmModule, targetMachine)))
+    return llvm::None;
+
   std::string targetISA;
   llvm::raw_string_ostream stream(targetISA);
-  llvm::buffer_ostream pstream(stream);
+
   llvm::legacy::PassManager codegenPasses;
-  targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                    llvm::CGFT_AssemblyFile);
-  codegenPasses.run(llvmModule);
-  return targetISA;
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CGFT_AssemblyFile))
+      return llvm::None;
+
+    codegenPasses.run(llvmModule);
+  }
+  return stream.str();
 }
 
 void gpu::SerializeToBlobPass::runOnOperation() {
@@ -58,7 +68,13 @@ void gpu::SerializeToBlobPass::runOnOperation() {
   if (!targetMachine)
     return signalPassFailure();
 
-  std::string targetISA = translateToISA(*llvmModule, *targetMachine);
+  Optional<std::string> maybeTargetISA =
+      translateToISA(*llvmModule, *targetMachine);
+
+  if (!maybeTargetISA.hasValue())
+    return signalPassFailure();
+
+  std::string targetISA = std::move(maybeTargetISA.getValue());
 
   // Serialize the target ISA.
   std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
@@ -71,6 +87,14 @@ void gpu::SerializeToBlobPass::runOnOperation() {
   getOperation()->setAttr(gpuBinaryAnnotation, attr);
 }
 
+LogicalResult
+gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  // Default hook: leaves the module untouched. TODO: If SerializeToCubin ends
+  // up defining optimizations, factor them into here from SerializeToHsaco.
+  return success();
+}
+
 void gpu::SerializeToBlobPass::getDependentDialects(
     DialectRegistry &registry) const {
   registerLLVMDialectTranslation(registry);

diff  --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
index 29edd535e3cf2..16949a70a810d 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/MLIRContext.h"
 
 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
+#include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
@@ -53,12 +54,24 @@ namespace {
 class SerializeToHsacoPass
     : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
 public:
-  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features);
+  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
+                       int optLevel);
+  SerializeToHsacoPass(const SerializeToHsacoPass &other);
   StringRef getArgument() const override { return "gpu-to-hsaco"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to HSACO binary annotations";
   }
 
+protected:
+  Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for HSACO compilation"),
+      llvm::cl::init(2)};
+
+  /// Adds LLVM optimization passes
+  LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                             llvm::TargetMachine &targetMachine) override;
+
 private:
   void getDependentDialects(DialectRegistry &registry) const override;
 
@@ -72,6 +85,8 @@ class SerializeToHsacoPass
 };
 } // namespace
 
+SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
+    : PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {}
 static std::string getDefaultChip() {
   const char kDefaultChip[] = "gfx900";
 
@@ -137,10 +152,12 @@ static void maybeSetOption(Pass::Option<std::string> &option,
 }
 
 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
-                                           StringRef features) {
+                                           StringRef features, int optLevel) {
   maybeSetOption(this->triple, [&triple] { return triple.str(); });
   maybeSetOption(this->chip, [&arch] { return arch.str(); });
   maybeSetOption(this->features, [&features] { return features.str(); });
+  if (this->optLevel.getNumOccurrences() == 0)
+    this->optLevel.setValue(optLevel);
 }
 
 void SerializeToHsacoPass::getDependentDialects(
@@ -149,6 +166,30 @@ void SerializeToHsacoPass::getDependentDialects(
   gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
+/// Sets the codegen level, then runs LLVM's optimization pipeline on the IR.
+LogicalResult
+SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
+                                   llvm::TargetMachine &targetMachine) {
+  int optLevel = this->optLevel.getValue();
+  if (optLevel < 0 || optLevel > 3)
+    return getOperation().emitError()
+           << "Invalid HSA optimization level: " << optLevel << "\n";
+  // The range check above keeps this cast within the levels 0-3.
+  targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
+  auto transformer =
+      makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine);
+  if (auto error = transformer(&llvmModule)) {
+    // Fold every LLVM error message into one MLIR diagnostic.
+    InFlightDiagnostic mlirError = getOperation()->emitError();
+    llvm::handleAllErrors(
+        std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) {
+          mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n";
+        });
+    return mlirError;
+  }
+  return success();
+}
+
 std::unique_ptr<SmallVectorImpl<char>>
 SerializeToHsacoPass::assembleIsa(const std::string &isa) {
   auto loc = getOperation().getLoc();
@@ -286,7 +327,7 @@ void mlir::registerGpuSerializeToHsacoPass() {
         LLVMInitializeAMDGPUTargetMC();
 
         return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
-                                                      "");
+                                                      "", 2);
       });
 }
 #else  // MLIR_GPU_TO_HSACO_PASS_ENABLE