[Mlir-commits] [mlir] [mlir][gpu] Move test-lower-to-nvvm to InitAllPasses.h (PR #75775)
llvmlistbot at llvm.org
Mon Dec 18 01:20:14 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-mlir-vector
Author: Guray Ozen (grypp)
<details>
<summary>Changes</summary>
The `test-lower-to-nvvm` pipeline serves as the common pipeline for NVVM+host compilation and is used across our CUDA integration tests.
This PR renames the `test-lower-to-nvvm` pipeline to `gpu-lower-to-nvvm` and moves its registration into `InitAllPasses.h`. The aim is to make the pipeline callable from Python and to provide a standardized compilation process for NVVM.
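As a rough illustration of the Python angle, here is a minimal sketch of what invoking the renamed pipeline through the MLIR Python bindings could look like. It assumes a build where `registerAllPasses()` runs with CUDA conversions enabled; the option values are illustrative, not part of this patch:

```python
# Minimal sketch (not part of this patch): parse IR and run the
# gpu-lower-to-nvvm pipeline from Python. Assumes the bindings were built
# with MLIR_ENABLE_CUDA_CONVERSIONS so the pipeline is registered.
from mlir.ir import Context, Module
from mlir.passmanager import PassManager

ASM = """
module {
  func.func @main() {
    return
  }
}
"""

with Context():
    module = Module.parse(ASM)
    # Option names mirror GPUToNVVMPipelineOptions; the values are examples.
    pm = PassManager.parse(
        "builtin.module(gpu-lower-to-nvvm{cubin-chip=sm_80 cubin-format=fatbin})"
    )
    pm.run(module.operation)
    print(module)
```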
---
Patch is 28.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75775.diff
34 Files Affected:
- (added) mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h (+22)
- (modified) mlir/include/mlir/InitAllPasses.h (+4)
- (modified) mlir/lib/Dialect/GPU/CMakeLists.txt (+1)
- (added) mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt (+12)
- (renamed) mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp (+13-14)
- (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir (+1-1)
- (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir (+1-1)
- (modified) mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir (+1-1)
- (modified) mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir (+3-3)
- (modified) mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir (+2-2)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/printf.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/shuffle.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir (+1-1)
- (modified) mlir/test/Integration/GPU/CUDA/two-modules.mlir (+1-1)
- (modified) mlir/test/lib/Dialect/GPU/CMakeLists.txt (-1)
- (modified) mlir/tools/mlir-opt/mlir-opt.cpp (-3)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
new file mode 100644
index 00000000000000..c2ab4fb54536a4
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
@@ -0,0 +1,22 @@
+//===- Passes.h - GPU NVVM pipeline entry points -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes of all GPU pipelines.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_PIPELINES_PASSES_H_
+#define MLIR_DIALECT_GPU_PIPELINES_PASSES_H_
+
+namespace mlir {
+namespace gpu {
+void registerGPUToNVVMPipeline();
+} // namespace gpu
+} // namespace mlir
+
+#endif
\ No newline at end of file
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index f22980036ffcfa..311d93477d037e 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -24,6 +24,7 @@
#include "mlir/Dialect/Bufferization/Pipelines/Passes.h"
#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Dialect/Func/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/Pipelines/Passes.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
@@ -91,6 +92,9 @@ inline void registerAllPasses() {
bufferization::registerBufferizationPipelines();
sparse_tensor::registerSparseTensorPipelines();
tosa::registerTosaToLinalgPipelines();
+#if MLIR_CUDA_CONVERSIONS_ENABLED
+ gpu::registerGPUToNVVMPipeline();
+#endif
}
} // namespace mlir
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index e8b69879ad6a7e..ab6834cb262fb5 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -105,6 +105,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
)
add_subdirectory(TransformOps)
+add_subdirectory(Pipelines)
if(MLIR_ENABLE_CUDA_RUNNER)
if(NOT MLIR_ENABLE_CUDA_CONVERSIONS)
diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
new file mode 100644
index 00000000000000..095f8fd5205172
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_mlir_dialect_library(MLIRGPUPipelines
+ GPUToNVVMPipeline.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
+
+ LINK_LIBS PUBLIC
+ MLIRMemRefTransforms
+ MLIRFuncDialect
+ MLIRPass
+ MLIRTransforms
+)
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
similarity index 90%
rename from mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
rename to mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 28f76bde0820a6..4cb8b1dc7bc2d1 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -1,4 +1,4 @@
-//===- TestLowerToNVVM.cpp - Test lowering to NVVM as a sink pass ---------===//
+//===- GPUToNVVMPipeline.cpp - Lowering to NVVM as a sink pass ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -27,6 +27,7 @@
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Pipelines/Passes.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/Passes.h"
@@ -39,8 +40,8 @@ using namespace mlir;
#if MLIR_CUDA_CONVERSIONS_ENABLED
namespace {
-struct TestLowerToNVVMOptions
- : public PassPipelineOptions<TestLowerToNVVMOptions> {
+struct GPUToNVVMPipelineOptions
+ : public PassPipelineOptions<GPUToNVVMPipelineOptions> {
PassOptions::Option<int64_t> indexBitWidth{
*this, "index-bitwidth",
llvm::cl::desc("Bitwidth of the index type for the host (warning this "
@@ -83,16 +84,14 @@ struct TestLowerToNVVMOptions
// Common pipeline
//===----------------------------------------------------------------------===//
void buildCommonPassPipeline(OpPassManager &pm,
- const TestLowerToNVVMOptions &options) {
+ const GPUToNVVMPipelineOptions &options) {
pm.addPass(createConvertNVGPUToNVVMPass());
pm.addPass(createGpuKernelOutliningPass());
pm.addPass(createConvertLinalgToLoopsPass());
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertNVVMToLLVMPass());
- pm.addPass(createConvertVectorToLLVMPass());
pm.addPass(createConvertMathToLLVMPass());
- pm.addPass(createFinalizeMemRefToLLVMConversionPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
@@ -115,7 +114,7 @@ void buildCommonPassPipeline(OpPassManager &pm,
// GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
void buildGpuPassPipeline(OpPassManager &pm,
- const TestLowerToNVVMOptions &options) {
+ const GPUToNVVMPipelineOptions &options) {
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
ConvertGpuOpsToNVVMOpsOptions opt;
opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv;
@@ -130,7 +129,7 @@ void buildGpuPassPipeline(OpPassManager &pm,
// Host Post-GPU pipeline
//===----------------------------------------------------------------------===//
void buildHostPostPipeline(OpPassManager &pm,
- const TestLowerToNVVMOptions &options) {
+ const GPUToNVVMPipelineOptions &options) {
GpuToLLVMConversionPassOptions opt;
opt.hostBarePtrCallConv = options.hostUseBarePtrCallConv;
opt.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv;
@@ -145,7 +144,7 @@ void buildHostPostPipeline(OpPassManager &pm,
}
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
- const TestLowerToNVVMOptions &options) {
+ const GPUToNVVMPipelineOptions &options) {
//===----------------------------------------------------------------------===//
// Common pipeline
//===----------------------------------------------------------------------===//
@@ -164,14 +163,14 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
} // namespace
namespace mlir {
-namespace test {
-void registerTestLowerToNVVM() {
- PassPipelineRegistration<TestLowerToNVVMOptions>(
- "test-lower-to-nvvm",
+namespace gpu {
+void registerGPUToNVVMPipeline() {
+ PassPipelineRegistration<GPUToNVVMPipelineOptions>(
+ "gpu-lower-to-nvvm",
"An example of pipeline to lower the main dialects (arith, linalg, "
"memref, scf, vector) down to NVVM.",
buildLowerToNVVMPassPipeline);
}
-} // namespace test
+} // namespace gpu
} // namespace mlir
#endif // MLIR_CUDA_CONVERSIONS_ENABLED
\ No newline at end of file
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 4483d18231e80e..42348e39832ade 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
+// RUN: | mlir-opt -gpu-lower-to-nvvm -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s
// CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index e36b83e931933a..62d0d9e1cac984 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -4,7 +4,7 @@
// RUN: mlir-opt \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt --gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8c991493a2b017..94a57d7c266819 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
// RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \
+// RUN: -convert-arith-to-llvm -gpu-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index f26c18c4ae3dd2..896051ab5dd7eb 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,7 +2,7 @@
// everything on the same thread.
// RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -test-lower-to-nvvm | \
+// RUN: -gpu-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -13,7 +13,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -test-lower-to-nvvm | \
+// RUN: -gpu-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -23,7 +23,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -test-lower-to-nvvm | \
+// RUN: -gpu-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
index c9f45ddad6ffcf..d4bd51aab03535 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: -transform-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
+// RUN: -gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
index 367b4f32ede386..3e5f291db8e744 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
@@ -11,7 +11,7 @@
// RUN: mlir-opt %s \
// RUN: -transform-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
-// RUN: -test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
+// RUN: -gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 95068974a1a07b..bbeddd5bb2285f 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 9ab0e59a291e07..d5950eae2543a6 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,7 +3,7 @@
// Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 41f4c1d35454d6..c75f9c1b5649b1 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index 13a05a2766e5df..fe999e0aa575b1 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -8,7 +8,7 @@
// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
index d858358a2892c6..dcd503c7bd806c 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
index 1ec926d9cacb01..8236550feb1113 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 070679689240c1..6f965c225e2d89 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 107e8a407d00cf..340db39f5d28f8 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 4aa44b9ce5e967..b4fc32ff9b838a 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 717dc542cc594b..f43a095584d69c 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation...
[truncated]
``````````
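For comparison with the `mlir-cpu-runner` RUN lines above, a hedged end-to-end sketch of the same compile-and-run flow from Python; the input file, entry-point name, and shared-library paths are placeholders for a local build rather than anything this patch adds:

```python
# Sketch only: lower with the now-registered pipeline, then JIT the host
# side. kernel.mlir and the shared-library paths below are placeholders.
from mlir.ir import Context, Module
from mlir.passmanager import PassManager
from mlir.execution_engine import ExecutionEngine

with Context():
    with open("kernel.mlir") as f:
        module = Module.parse(f.read())
    PassManager.parse(
        "builtin.module(gpu-lower-to-nvvm{cubin-format=fatbin})"
    ).run(module.operation)
    engine = ExecutionEngine(
        module,
        opt_level=3,
        shared_libs=[
            "libmlir_cuda_runtime.so",  # %mlir_cuda_runtime in the RUN lines
            "libmlir_runner_utils.so",  # %mlir_runner_utils in the RUN lines
        ],
    )
    engine.invoke("main")  # @main must be marked llvm.emit_c_interface
```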
</details>
https://github.com/llvm/llvm-project/pull/75775