[Mlir-commits] [mlir] [MLIR] Fixes NVGPU Integration Test Passes (PR #69934)

Guray Ozen llvmlistbot at llvm.org
Mon Oct 23 08:16:26 PDT 2023


https://github.com/grypp created https://github.com/llvm/llvm-project/pull/69934

The `test-lower-to-nvvm` pipeline, designed for the NVGPU dialect within GPU kernels, plays an important role in compiling integration tests. This PR restructures the passes and cleans up the code.

>From 189bf377a7ec9b195bc5deaec7c042030b30ee00 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Mon, 23 Oct 2023 17:15:30 +0200
Subject: [PATCH] [MLIR] Fixes NVGPU Integration Test Passes

The `test-lower-to-nvvm` pipeline, designed for the NVGPU dialect within GPU kernels, plays an important role in compiling integration tests. This PR restructures the passes and cleans up the code.
---
 mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp | 282 ++++--------------
 1 file changed, 55 insertions(+), 227 deletions(-)

diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index b5af22f23a77cbc..c4cc0d5ae38d9be 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -28,6 +28,8 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassOptions.h"
@@ -39,27 +41,11 @@ using namespace mlir;
 namespace {
 struct TestLowerToNVVMOptions
     : public PassPipelineOptions<TestLowerToNVVMOptions> {
-  PassOptions::Option<int64_t> hostIndexBitWidth{
-      *this, "host-index-bitwidth",
+  PassOptions::Option<int64_t> indexBitWidth{
+      *this, "index-bitwidth",
       llvm::cl::desc("Bitwidth of the index type for the host (warning this "
                      "should be 64 until the GPU layering is fixed)"),
       llvm::cl::init(64)};
-  PassOptions::Option<bool> hostUseBarePtrCallConv{
-      *this, "host-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the host (warning "
-          "this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
-  PassOptions::Option<int64_t> kernelIndexBitWidth{
-      *this, "kernel-index-bitwidth",
-      llvm::cl::desc("Bitwidth of the index type for the GPU kernels"),
-      llvm::cl::init(64)};
-  PassOptions::Option<bool> kernelUseBarePtrCallConv{
-      *this, "kernel-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the kernel "
-          "(warning this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
   PassOptions::Option<std::string> cubinTriple{
       *this, "cubin-triple",
       llvm::cl::desc("Triple to use to serialize to cubin."),
@@ -74,175 +60,78 @@ struct TestLowerToNVVMOptions
   PassOptions::Option<std::string> cubinFormat{
       *this, "cubin-format",
       llvm::cl::desc("Compilation format to use to serialize to cubin."),
-      llvm::cl::init("isa")};
+      llvm::cl::init("bin")};
   PassOptions::Option<int> optLevel{
       *this, "opt-level",
       llvm::cl::desc("Optimization level for NVVM compilation"),
       llvm::cl::init(2)};
 };
 
+//===----------------------------------------------------------------------===//
+// Common pipeline
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(OpPassManager &pm,
+                             const TestLowerToNVVMOptions &options) {
+  pm.addPass(createConvertNVGPUToNVVMPass());
+  pm.addPass(createGpuKernelOutliningPass());
+  pm.addPass(createConvertLinalgToLoopsPass());
+  pm.addPass(createConvertVectorToSCFPass());
+  pm.addPass(createConvertSCFToCFPass());
+  pm.addPass(createConvertNVVMToLLVMPass());
+  pm.addPass(createConvertVectorToLLVMPass());
+  pm.addPass(createConvertMathToLLVMPass());
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addPass(createConvertFuncToLLVMPass());
+  pm.addPass(memref::createExpandStridedMetadataPass());
+
+  GpuNVVMAttachTargetOptions nvvmTargetOptions;
+  nvvmTargetOptions.triple = options.cubinTriple;
+  nvvmTargetOptions.chip = options.cubinChip;
+  nvvmTargetOptions.features = options.cubinFeatures;
+  nvvmTargetOptions.optLevel = options.optLevel;
+  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
+  pm.addPass(createLowerAffinePass());
+  pm.addPass(createArithToLLVMConversionPass());
+  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
+  convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+}
+
 //===----------------------------------------------------------------------===//
 // GPUModule-specific stuff.
 //===----------------------------------------------------------------------===//
 void buildGpuPassPipeline(OpPassManager &pm,
                           const TestLowerToNVVMOptions &options) {
   pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
 
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<gpu::GPUModuleOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.kernelIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.kernelIndexBitWidth;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  // TODO: C++20 designated initializers.
-  // The following pass is inconsistent.
-  // TODO: fix inconsistence.
-  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
-
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  // This pass is needed for PTX building
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(OpPassManager &pm,
+                           const TestLowerToNVVMOptions &options) {
+  pm.addPass(createGpuToLLVMConversionPass());
 
-  // Sprinkle some cleanups.
+  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
-
-  // Finally we can reconcile unrealized casts.
-  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 }
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
-  // Start with a cleanup pass.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
   //===----------------------------------------------------------------------===//
-  // NVGPU lowers device code as well as host code to the driver, so must run
-  // before outlining.
+  // Common pipeline
   //===----------------------------------------------------------------------===//
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-
-  //===----------------------------------------------------------------------===//
-  // Host-specific stuff.
-  //===----------------------------------------------------------------------===//
-  // Important, must be run at the top-level.
-  pm.addPass(createGpuKernelOutliningPass());
-
-  // Important, all host passes must be run at the func level so that host
-  // conversions can remain with 64 bit indices without polluting the GPU
-  // kernel that may have 32 bit indices.
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<func::FuncOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<func::FuncOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  finalizeMemRefToLLVMConversionPassOptions.useAlignedAlloc = true;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.hostIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
-  // Sprinkle some cleanups.
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  buildCommonPassPipeline(pm, options);
 
   //===----------------------------------------------------------------------===//
   // GPUModule-specific stuff.
@@ -252,68 +141,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   //===----------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Attach an NVVM target to all the GPU modules with the provided target
-  // options.
-  // TODO: C++20 designated initializers.
-  GpuNVVMAttachTargetOptions nvvmTargetOptions;
-  nvvmTargetOptions.triple = options.cubinTriple;
-  nvvmTargetOptions.chip = options.cubinChip;
-  nvvmTargetOptions.features = options.cubinFeatures;
-  nvvmTargetOptions.optLevel = options.optLevel;
-  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
-
-  // Convert GPU to LLVM.
-  // TODO: C++20 designated initializers.
-  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
-  // Note: hostBarePtrCallConv must be false for now otherwise
-  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower the to bare ptr.
-  gpuToLLVMConversionOptions.hostBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.useOpaquePointers = true;
-
-  // TODO: something useful here.
-  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
-  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
-  // Serialize all GPU modules to binaries.
-  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
-  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
-  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  convertFuncToLLVMPassOptions2.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions2.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions2.useOpaquePointers = true;
-  pm.addPass(createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions2));
-
-  // Sprinkle some cleanups.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
-  // Finally we can reconcile unrealized casts.
-  pm.addPass(createReconcileUnrealizedCastsPass());
+  buildHostPostPipeline(pm, options);
 }
 } // namespace
 



More information about the Mlir-commits mailing list