[Mlir-commits] [mlir] d21b672 - [mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (#65768)
llvmlistbot at llvm.org
Sat Sep 9 04:03:43 PDT 2023
Author: Fabian Mora
Date: 2023-09-09T07:03:38-04:00
New Revision: d21b67293be15f8a89378e4785d70cc037866406
URL: https://github.com/llvm/llvm-project/commit/d21b67293be15f8a89378e4785d70cc037866406
DIFF: https://github.com/llvm/llvm-project/commit/d21b67293be15f8a89378e4785d70cc037866406.diff
LOG: [mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (#65768)
Migrate the tests referencing `gpu-to-cubin` to the new compilation workflow
using `TargetAttrs`. The `test-lower-to-nvvm` pass pipeline was also switched
to the new workflow, simplifying the introduction of future tests.
The `createLowerGpuOpsToNVVMOpsPass` function was removed, as it did not
allow passing all of the options available in the `ConvertGpuOpsToNVVMOps`
pass.
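Callers of the removed factory can instead construct the pass with the full
set of tablegen'd options. A minimal sketch (assuming the generated
declarations are reachable through mlir/Conversion/Passes.h; the helper name
is hypothetical):

    #include "mlir/Conversion/Passes.h"
    #include "mlir/Dialect/GPU/IR/GPUDialect.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical helper illustrating the replacement for
    // createLowerGpuOpsToNVVMOpsPass(indexBitwidth, hasRedux).
    static void addGpuToNVVM(mlir::PassManager &pm) {
      mlir::ConvertGpuOpsToNVVMOpsOptions options;
      options.indexBitwidth = 32;        // previously the only knobs the
      options.hasRedux = true;           // factory exposed...
      options.useBarePtrCallConv = true; // ...options like this were not.
      pm.addNestedPass<mlir::gpu::GPUModuleOp>(
          mlir::createConvertGpuOpsToNVVMOps(options));
    }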
Added:
Modified:
mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
mlir/include/mlir/Conversion/Passes.td
mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
mlir/test/Integration/GPU/CUDA/async.mlir
mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
mlir/test/Integration/GPU/CUDA/printf.mlir
mlir/test/Integration/GPU/CUDA/shuffle.mlir
mlir/test/Integration/GPU/CUDA/two-modules.mlir
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index 46f29c6dd8b9267..e0f4c71051e506a 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -16,9 +16,7 @@ namespace mlir {
class LLVMTypeConverter;
class ConversionTarget;
class RewritePatternSet;
-
-template <typename OpT>
-class OperationPass;
+class Pass;
namespace gpu {
class GPUModuleOp;
@@ -45,14 +43,6 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
/// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns);
-
-/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
-/// index bitwidth used for the lowering of the device side index computations
-/// is configurable.
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
- unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
- bool hasRedux = false);
-
} // namespace mlir
#endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275bf3..3218760931b8cb0 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -486,7 +486,6 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
let summary = "Generate NVVM operations for gpu operations";
- let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
let dependentDialects = [
"cf::ControlFlowDialect",
"memref::MemRefDialect",
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 06469dc82b3fc5d..764b6a779b98c7a 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -210,11 +210,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
/// code.
struct LowerGpuOpsToNVVMOpsPass
: public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
- LowerGpuOpsToNVVMOpsPass() = default;
- LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
- this->indexBitwidth = indexBitwidth;
- this->hasRedux = hasRedux;
- }
+ using Base::Base;
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
@@ -378,8 +374,3 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
"__nv_tanh");
populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
}
-
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
- return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
-}
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index a7fd5a25e68314e..24c4c4c43a93dea 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
pm.addPass(createSparseGPUCodegenPass());
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
- pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
}
// TODO(springerm): Add sparse support to the BufferDeallocation pass and add
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 2c1ae3ee840d0f8..0cb06b7bf1d2001 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,6 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
+// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s
// CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 8eb90fd3ca9946e..80972f244ec02d7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,10 +2,9 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt \
-// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
+// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: %s \
-// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8571c5ca5f3dc26..8c991493a2b0174 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,9 +1,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
// RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN: -convert-arith-to-llvm -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index c671c1843862f94..f26c18c4ae3dd28 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,9 +2,7 @@
// everything on the same thread.
// RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -15,9 +13,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -27,9 +23,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 535ba52d66f0062..591bf1b4fd18231 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index c4ca46521eeb4c7..51bd23f817b33f1 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,9 +3,7 @@
// Similar to the wmma-matmul-f32 but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index ae410dce281b17a..0307b3d504be9f6 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index f4324a14a36b64a..b131b8682ddee06 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -10,9 +8,7 @@
// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 0a8d38f14527914..155423db7e05049 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index bcd785d35291c66..e5047b6efa3bf25 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index aa4b0e8820479dc..163e9fdba60c1a9 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 2e7d046c3921411..381db2639c371f3 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 32cfa27c8988a2c..23c6c117e67f36b 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 30767b9495b6f20..3c5a100b5b90d57 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index e6dd91ace974322..d2a5127a34c3bdd 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
// RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index afcb674858c8691..a5d04f7322b4914 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,8 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 444e2877c822c50..7657bf4732d32b7 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index fce773974d5ba0b..1a35d1e78b09475 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,6 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 6a784ca32f9ef7b..40fcea857d5b4eb 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 5f6e5d75aff5b5c..5a9acdf3d8da6ba 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5db6f56fb4b3817..99e19dae0d72b7b 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -65,7 +65,7 @@ struct TestLowerToNVVMOptions
llvm::cl::init("nvptx64-nvidia-cuda")};
PassOptions::Option<std::string> cubinChip{
*this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
- llvm::cl::init("sm_80")};
+ llvm::cl::init("sm_50")};
PassOptions::Option<std::string> cubinFeatures{
*this, "cubin-features",
llvm::cl::desc("Features to use to serialize to cubin."),
@@ -126,13 +126,14 @@ void buildGpuPassPipeline(OpPassManager &pm,
// TODO: C++20 designated initializers.
// The following pass is inconsistent.
- // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
- // convertGpuOpsToNVVMOpsOptions.indexBitwidth =
- // options.kernelIndexBitWidth;
+  // TODO: fix inconsistency.
+ ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
+ convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
+ options.kernelUseBarePtrCallConv;
+ convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
+ convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
pm.addNestedPass<gpu::GPUModuleOp>(
- // TODO: fix inconsistence.
- createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
- options.kernelIndexBitWidth));
+ createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
// TODO: C++20 designated initializers.
ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
@@ -141,22 +142,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
- // TODO: C++20 designated initializers.
- GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
- // Note: hostBarePtrCallConv must be false for now otherwise
- // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower them to bare ptr.
- gpuToLLVMConversionOptions.hostBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- gpuToLLVMConversionOptions.kernelBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- gpuToLLVMConversionOptions.useOpaquePointers = true;
-
- // TODO: something useful here.
- // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
- pm.addNestedPass<gpu::GPUModuleOp>(
- createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
// Convert vector to LLVM (always needed).
// TODO: C++20 designated initializers.
ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
@@ -170,11 +155,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
// Finally we can reconcile unrealized casts.
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
-
-#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
- pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
- options.cubinTriple, options.cubinChip, options.cubinFeatures));
-#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
}
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
@@ -251,22 +231,16 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
//===----------------------------------------------------------------------===//
// Host post-GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
- // Convert vector to LLVM (always needed).
+ // Attach an NVVM target to all the GPU modules with the provided target
+ // options.
// TODO: C++20 designated initializers.
- ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
- convertVectorToLLVMPassOptions.reassociateFPReductions = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+ GpuNVVMAttachTargetOptions nvvmTargetOptions;
+ nvvmTargetOptions.triple = options.cubinTriple;
+ nvvmTargetOptions.chip = options.cubinChip;
+ nvvmTargetOptions.features = options.cubinFeatures;
+ pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
- pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
- // This must happen after cubin translation otherwise gpu.launch_func is
- // illegal if no cubin annotation is present.
+ // Convert GPU to LLVM.
// TODO: C++20 designated initializers.
GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
// Note: hostBarePtrCallConv must be false for now otherwise
@@ -277,10 +251,28 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
gpuToLLVMConversionOptions.kernelBarePtrCallConv =
options.kernelUseBarePtrCallConv;
gpuToLLVMConversionOptions.useOpaquePointers = true;
+
// TODO: something useful here.
// gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
+ // Serialize all GPU modules to binaries.
+ pm.addPass(createGpuModuleToBinaryPass());
+
+ // Convert vector to LLVM (always needed).
+ // TODO: C++20 designated initializers.
+ ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
+ convertVectorToLLVMPassOptions.reassociateFPReductions = true;
+ pm.addNestedPass<func::FuncOp>(
+ createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+
+ ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
+ // Must be 64b on the host, things don't compose properly around
+ // gpu::LaunchOp and gpu::HostRegisterOp.
+ // TODO: fix GPU layering.
+ convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
+ pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
+
// Convert Func to LLVM (always needed).
// TODO: C++20 designated initializers.
ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;