[Mlir-commits] [mlir] [MLIR] Use `test-lower-to-nvvm` for sm_90 Integration Tests on GitHub (PR #68184)

Guray Ozen llvmlistbot at llvm.org
Wed Oct 4 00:28:53 PDT 2023


https://github.com/grypp created https://github.com/llvm/llvm-project/pull/68184

This PR enables the `test-lower-to-nvvm` pass pipeline for the integration tests targeting the NVIDIA sm_90 architecture.

This modification involves two key adjustments:

1) Calls `createConvertNVGPUToNVVMPass` before the outlining process. This particular pass is responsible for generating both device and host code. On the host, it calls the CUDA driver to build the TMA descriptor (`cuTensorMap`).

2) Integrates the `createConvertNVVMToLLVMPass` to generate PTX for NVVM Ops.

>From d6f6fb9926e0e96093004c7540292c215e111a99 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Wed, 4 Oct 2023 09:27:50 +0200
Subject: [PATCH] [MLIR] Use `test-lower-to-nvvm` for sm_90 Integration Tests
 on GitHub

This PR enables `test-lower-to-nvvm` pass pipeline for the integration tests for NVIDIA sm_90 architecture.

This modification involves two key adjustments:

1) Calls `createConvertNVGPUToNVVMPass` before the outlining process. This particular pass is responsible for generating both device and host code. On the host, it calls the CUDA driver to build the TMA descriptor (`cuTensorMap`).

2) Integrates the `createConvertNVVMToLLVMPass` to generate PTXs  for NVVM Ops.
---
 .../sm90/tma_load_128x64_swizzle128b.mlir     | 16 +------------
 .../CUDA/sm90/tma_load_64x64_swizzle128b.mlir | 16 +------------
 .../sm90/tma_load_64x8_8x128_noswizzle.mlir   | 20 ++++++----------
 mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp | 23 +++++++++++++++----
 4 files changed, 27 insertions(+), 48 deletions(-)

diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index aa11773defdb15f..2ad39405cc06f4b 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -1,25 +1,11 @@
 // RUN: mlir-opt %s \
-// RUN:    -convert-nvgpu-to-nvvm \
-// RUN:    -gpu-kernel-outlining \
-// RUN:    -convert-vector-to-scf  \
-// RUN:    -convert-scf-to-cf \
-// RUN:    -convert-nvvm-to-llvm \
-// RUN:    -convert-vector-to-llvm \
-// RUN:    -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:    -convert-arith-to-llvm \
-// RUN:    -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:    -convert-func-to-llvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void \
 // RUN:  | FileCheck %s
 
-
 // Test swizzling with TMA load
 // 128B Swizzle Each numbered cell is 16 byte 
 // |-------------------------------|
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 5c465f7de8abdb5..242c5ff875cf44a 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -1,19 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:    -convert-nvgpu-to-nvvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -gpu-kernel-outlining \
-// RUN:    -convert-vector-to-scf  \
-// RUN:    -convert-scf-to-cf \
-// RUN:    -convert-nvvm-to-llvm \
-// RUN:    -convert-vector-to-llvm \
-// RUN:    -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:    -convert-arith-to-llvm \
-// RUN:    -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:    -convert-func-to-llvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 5331ebb87d37de5..44b127bd409ba62 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -1,16 +1,10 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
-// RUN:         -gpu-kernel-outlining \
-// RUN:         -convert-nvvm-to-llvm \
-// RUN:         -convert-scf-to-cf  \
-// RUN:         -convert-vector-to-llvm \
-// RUN:         -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:         -convert-arith-to-llvm \
-// RUN:         -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:         -convert-func-to-llvm \
-// RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
+// RUN: mlir-opt %s \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN:  | FileCheck %s
 
 // Basic PTX check to make sure we are generating the right instructions.
 
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 174d27b0da8a1dd..b5af22f23a77cbc 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
@@ -143,11 +144,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
   // Convert vector to LLVM (always needed).
@@ -157,6 +153,9 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  // This pass is needed for PTX building
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -167,6 +166,20 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
+  // Start with a cleanup pass.
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  //===----------------------------------------------------------------------===//
+  // NVGPU lowers device code as well as host code to the driver, so must run
+  // before outlining.
+  //===----------------------------------------------------------------------===//
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//



More information about the Mlir-commits mailing list