[Mlir-commits] [mlir] ba8ae98 - [MLIR] Fixes NVGPU Integration Test Passes Ordering (#69934)
llvmlistbot at llvm.org
Tue Oct 24 06:56:51 PDT 2023
Author: Guray Ozen
Date: 2023-10-24T15:56:47+02:00
New Revision: ba8ae9866be1d0f65f1c86fd1c657f47863c9115
URL: https://github.com/llvm/llvm-project/commit/ba8ae9866be1d0f65f1c86fd1c657f47863c9115
DIFF: https://github.com/llvm/llvm-project/commit/ba8ae9866be1d0f65f1c86fd1c657f47863c9115.diff
LOG: [MLIR] Fixes NVGPU Integration Test Passes Ordering (#69934)
The `test-lower-to-nvvm` pipeline, which lowers the NVGPU dialect within GPU
kernels, plays an important role in compiling the integration tests. This PR
restructures the passes into three helper pipelines (a common pipeline, a
GPU-module-specific pipeline, and a host post-GPU pipeline), cleans up the
code, and fixes the ordering of the passes.
This fix is needed for #69913
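For reference, a minimal sketch of driving the restructured pipeline from
C++. This assumes the TestLowerToNVVMOptions and buildLowerToNVVMPassPipeline
definitions from the diff below are visible at the call site (they live in an
anonymous namespace in TestLowerToNVVM.cpp), and the helper name
runTestLowerToNVVM is hypothetical:

    // Sketch only: builds and runs the restructured test pipeline on a
    // module, using the option defaults shown in the diff below
    // (index-bitwidth=64, cubin-format="bin", opt-level=2).
    static mlir::LogicalResult runTestLowerToNVVM(mlir::ModuleOp module) {
      mlir::PassManager pm(module->getContext());
      TestLowerToNVVMOptions options;
      // Stages are appended in order: common pipeline, then the
      // GPU-module-specific passes, then the host post-GPU pipeline.
      buildLowerToNVVMPassPipeline(pm, options);
      return pm.run(module);
    }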
Added:
Modified:
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
Removed:
################################################################################
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index b5af22f23a77cbc..c4cc0d5ae38d9be 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -28,6 +28,8 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
@@ -39,27 +41,11 @@ using namespace mlir;
namespace {
struct TestLowerToNVVMOptions
: public PassPipelineOptions<TestLowerToNVVMOptions> {
- PassOptions::Option<int64_t> hostIndexBitWidth{
- *this, "host-index-bitwidth",
+ PassOptions::Option<int64_t> indexBitWidth{
+ *this, "index-bitwidth",
llvm::cl::desc("Bitwidth of the index type for the host (warning this "
"should be 64 until the GPU layering is fixed)"),
llvm::cl::init(64)};
- PassOptions::Option<bool> hostUseBarePtrCallConv{
- *this, "host-bare-ptr-calling-convention",
- llvm::cl::desc(
- "Whether to use the bareptr calling convention on the host (warning "
- "this should be false until the GPU layering is fixed)"),
- llvm::cl::init(false)};
- PassOptions::Option<int64_t> kernelIndexBitWidth{
- *this, "kernel-index-bitwidth",
- llvm::cl::desc("Bitwidth of the index type for the GPU kernels"),
- llvm::cl::init(64)};
- PassOptions::Option<bool> kernelUseBarePtrCallConv{
- *this, "kernel-bare-ptr-calling-convention",
- llvm::cl::desc(
- "Whether to use the bareptr calling convention on the kernel "
- "(warning this should be false until the GPU layering is fixed)"),
- llvm::cl::init(false)};
PassOptions::Option<std::string> cubinTriple{
*this, "cubin-triple",
llvm::cl::desc("Triple to use to serialize to cubin."),
@@ -74,175 +60,78 @@ struct TestLowerToNVVMOptions
PassOptions::Option<std::string> cubinFormat{
*this, "cubin-format",
llvm::cl::desc("Compilation format to use to serialize to cubin."),
- llvm::cl::init("isa")};
+ llvm::cl::init("bin")};
PassOptions::Option<int> optLevel{
*this, "opt-level",
llvm::cl::desc("Optimization level for NVVM compilation"),
llvm::cl::init(2)};
};
+//===----------------------------------------------------------------------===//
+// Common pipeline
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(OpPassManager &pm,
+ const TestLowerToNVVMOptions &options) {
+ pm.addPass(createConvertNVGPUToNVVMPass());
+ pm.addPass(createGpuKernelOutliningPass());
+ pm.addPass(createConvertLinalgToLoopsPass());
+ pm.addPass(createConvertVectorToSCFPass());
+ pm.addPass(createConvertSCFToCFPass());
+ pm.addPass(createConvertNVVMToLLVMPass());
+ pm.addPass(createConvertVectorToLLVMPass());
+ pm.addPass(createConvertMathToLLVMPass());
+ pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+ pm.addPass(createConvertFuncToLLVMPass());
+ pm.addPass(memref::createExpandStridedMetadataPass());
+
+ GpuNVVMAttachTargetOptions nvvmTargetOptions;
+ nvvmTargetOptions.triple = options.cubinTriple;
+ nvvmTargetOptions.chip = options.cubinChip;
+ nvvmTargetOptions.features = options.cubinFeatures;
+ nvvmTargetOptions.optLevel = options.optLevel;
+ pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
+ pm.addPass(createLowerAffinePass());
+ pm.addPass(createArithToLLVMConversionPass());
+ ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
+ convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth;
+ pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
+ pm.addPass(createCanonicalizerPass());
+ pm.addPass(createCSEPass());
+}
+
//===----------------------------------------------------------------------===//
// GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
void buildGpuPassPipeline(OpPassManager &pm,
const TestLowerToNVVMOptions &options) {
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+ pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
- // Convert SCF to CF (always needed).
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
- // Convert Math to LLVM (always needed).
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToLLVMPass());
- // Expand complicated MemRef operations before lowering them.
- pm.addNestedPass<gpu::GPUModuleOp>(memref::createExpandStridedMetadataPass());
- // The expansion may create affine expressions. Get rid of them.
- pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
-
- // Convert MemRef to LLVM (always needed).
- // TODO: C++20 designated initializers.
- FinalizeMemRefToLLVMConversionPassOptions
- finalizeMemRefToLLVMConversionPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
- options.kernelIndexBitWidth;
- finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(createFinalizeMemRefToLLVMConversionPass(
- finalizeMemRefToLLVMConversionPassOptions));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertFuncToLLVMPassOptions.indexBitwidth = options.kernelIndexBitWidth;
- convertFuncToLLVMPassOptions.useBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- convertFuncToLLVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
- // TODO: C++20 designated initializers.
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt.indexBitwidth = options.kernelIndexBitWidth;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
- // TODO: C++20 designated initializers.
- // The following pass is inconsistent.
- // TODO: fix inconsistence.
- ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
- convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
- convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
-
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
- // Convert vector to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
- convertVectorToLLVMPassOptions.reassociateFPReductions = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
- // This pass is needed for PTX building
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(OpPassManager &pm,
+ const TestLowerToNVVMOptions &options) {
+ pm.addPass(createGpuToLLVMConversionPass());
- // Sprinkle some cleanups.
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
-
- // Finally we can reconcile unrealized casts.
- pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+ pm.addPass(createReconcileUnrealizedCastsPass());
}
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
const TestLowerToNVVMOptions &options) {
- // Start with a cleanup pass.
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
//===----------------------------------------------------------------------===//
- // NVGPU lowers device code as well as host code to the driver, so must run
- // before outlining.
+ // Common pipeline
//===----------------------------------------------------------------------===//
- // TODO: C++20 designated initializers.
- ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
- convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-
- //===----------------------------------------------------------------------===//
- // Host-specific stuff.
- //===----------------------------------------------------------------------===//
- // Important, must be run at the top-level.
- pm.addPass(createGpuKernelOutliningPass());
-
- // Important, all host passes must be run at the func level so that host
- // conversions can remain with 64 bit indices without polluting the GPU
- // kernel that may have 32 bit indices.
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
- // Convert SCF to CF (always needed).
- pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
- // Convert Math to LLVM (always needed).
- pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
- // Expand complicated MemRef operations before lowering them.
- pm.addNestedPass<func::FuncOp>(memref::createExpandStridedMetadataPass());
- // The expansion may create affine expressions. Get rid of them.
- pm.addNestedPass<func::FuncOp>(createLowerAffinePass());
-
- // Convert MemRef to LLVM (always needed).
- // TODO: C++20 designated initializers.
- FinalizeMemRefToLLVMConversionPassOptions
- finalizeMemRefToLLVMConversionPassOptions;
- finalizeMemRefToLLVMConversionPassOptions.useAlignedAlloc = true;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
- options.hostIndexBitWidth;
- finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(createFinalizeMemRefToLLVMConversionPass(
- finalizeMemRefToLLVMConversionPassOptions));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertFuncToLLVMPassOptions.indexBitwidth = options.hostIndexBitWidth;
- convertFuncToLLVMPassOptions.useBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- convertFuncToLLVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
- // TODO: C++20 designated initializers.
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
- pm.addNestedPass<func::FuncOp>(
- createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
- pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
- // Sprinkle some cleanups.
- pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- pm.addNestedPass<func::FuncOp>(createCSEPass());
+ buildCommonPassPipeline(pm, options);
//===----------------------------------------------------------------------===//
// GPUModule-specific stuff.
@@ -252,68 +141,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
//===----------------------------------------------------------------------===//
// Host post-GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
- // Attach an NVVM target to all the GPU modules with the provided target
- // options.
- // TODO: C++20 designated initializers.
- GpuNVVMAttachTargetOptions nvvmTargetOptions;
- nvvmTargetOptions.triple = options.cubinTriple;
- nvvmTargetOptions.chip = options.cubinChip;
- nvvmTargetOptions.features = options.cubinFeatures;
- nvvmTargetOptions.optLevel = options.optLevel;
- pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
-
- // Convert GPU to LLVM.
- // TODO: C++20 designated initializers.
- GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
- // Note: hostBarePtrCallConv must be false for now otherwise
- // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
- // lower the to bare ptr.
- gpuToLLVMConversionOptions.hostBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- gpuToLLVMConversionOptions.kernelBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- gpuToLLVMConversionOptions.useOpaquePointers = true;
-
- // TODO: something useful here.
- // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
- pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
- // Serialize all GPU modules to binaries.
- GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
- gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
- pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-
- // Convert vector to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
- convertVectorToLLVMPassOptions.reassociateFPReductions = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
- pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- convertFuncToLLVMPassOptions2.indexBitwidth = options.hostIndexBitWidth;
- convertFuncToLLVMPassOptions2.useBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- convertFuncToLLVMPassOptions2.useOpaquePointers = true;
- pm.addPass(createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions2));
-
- // Sprinkle some cleanups.
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
- // Finally we can reconcile unrealized casts.
- pm.addPass(createReconcileUnrealizedCastsPass());
+ buildHostPostPipeline(pm, options);
}
} // namespace