[Mlir-commits] [mlir] 632ccc5 - [mlir][sparse][gpu] remove tuple as one of the spmm_buffer_size output types
Kun Wu
llvmlistbot at llvm.org
Mon Jun 19 08:58:23 PDT 2023
Author: Kun Wu
Date: 2023-06-19T15:57:50Z
New Revision: 632ccc538cf454a00010fa83e0734b562a34faa8
URL: https://github.com/llvm/llvm-project/commit/632ccc538cf454a00010fa83e0734b562a34faa8
DIFF: https://github.com/llvm/llvm-project/commit/632ccc538cf454a00010fa83e0734b562a34faa8.diff
LOG: [mlir][sparse][gpu] remove tuple as one of the spmm_buffer_size output types
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D153188
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index d7b985443297a..0e13295b1db07 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1963,7 +1963,7 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
}];
}
-def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
+def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, AttrSizedResultSegments]> {
let summary = "Precompute buffersize for SpMM operation";
let description = [{
The `gpu.spmm_buffer_size` operation returns the buffer size required
@@ -1994,8 +1994,7 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
GPU_SparseDnTensorHandle:$dnmatB,
GPU_SparseDnTensorHandle:$dnmatC,
TypeAttr:$computeType);
- let results = (outs Res<AnyTypeOf<[Index, TupleOf<[Index, Index,
- Index]>]>>:$bufferSzs,
+ let results = (outs Variadic<Index>:$bufferSzs,
Optional<GPU_AsyncToken>:$asyncToken);
let builders = [OpBuilder<(ins
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 05800aba8690f..580de21f40797 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1746,18 +1746,31 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
rewriter.getIndexAttr(3));
- bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
- llvmInt64Type, three);
- bufferSize =
- rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, bufferSize);
-
+ auto bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
+ llvmInt64Type, three);
createCuSparseLtSpMMBufferSizeBuilder
.create(loc, rewriter,
{bufferSize, adaptor.getEnv(), modeA, modeB,
adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(),
computeType, stream})
.getResult();
- rewriter.replaceOp(op, {bufferSize, stream});
+
+ auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(
+ loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
+ ValueRange{rewriter.create<LLVM::ConstantOp>(
+ loc, getIndexType(), rewriter.getIndexAttr(1))});
+ auto bufferSizePtr2 = rewriter.create<LLVM::GEPOp>(
+ loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
+ ValueRange{rewriter.create<LLVM::ConstantOp>(
+ loc, getIndexType(), rewriter.getIndexAttr(2))});
+ auto bufferSize0 =
+ rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSize);
+ auto bufferSize1 =
+ rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr1);
+ auto bufferSize2 =
+ rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr2);
+
+ rewriter.replaceOp(op, {bufferSize0, bufferSize1, bufferSize2, stream});
} else {
auto computeType = genConstInt32From(
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index abb4d05760fba..00cfb65e99a87 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -556,9 +556,10 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a,
auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
- auto workspace_size = reinterpret_cast<size_t *>(bs);
- auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]);
- auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]);
+ auto workspace_size = reinterpret_cast<int64_t *>(bs);
+ auto compressed_size = &(reinterpret_cast<int64_t *>(bs)[1]);
+ auto compressed_buffer_size = &(reinterpret_cast<int64_t *>(bs)[2]);
+ size_t workspace_size_, compressed_size_, compressed_buffer_size_;
auto cTp = static_cast<cusparseComputeType>(ctp);
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
@@ -577,15 +578,14 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a,
handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
CUSPARSE_REPORT_IF_ERROR(
- cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size))
+ cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_))
CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
- handle, &(matA->plan), compressed_size, compressed_buffer_size))
-
+ handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_))
// avoid zero-alloc
- *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size);
- *compressed_size = (*compressed_size == 0 ? 1 : *compressed_size);
+ *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_);
+ *compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_);
*compressed_buffer_size =
- (*compressed_buffer_size == 0 ? 1 : *compressed_buffer_size);
+ (compressed_buffer_size_ == 0 ? 1 : compressed_buffer_size_);
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
index 8fa28cfeae3b0..d46baa7c4ef66 100644
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -23,7 +23,7 @@ module attributes {gpu.container_module} {
%env, %token3 = gpu.create_sparse_env async [%token2]
%spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref<?xf16>
%dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf16>
- %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple<index,index,index> into f16
+ %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16
%token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 676454003e472..0ce978c4d7cec 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -28,18 +28,14 @@ module {
%token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
%token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
%token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
- // Allocating larger memory than enough for workspace and storing compressed
- // matrices as we haven't implemented the op to unpack tuple %bufferSzs to
- // retrieve these three sizes.
- // TODO: implement the op to unpack tuple %bufferSzs.
- %mem1, %token7 = gpu.alloc async [%token6] (%c1048576) : memref<?xf16>
- %mem2, %token8 = gpu.alloc async [%token7] (%c1048576) : memref<?xf16>
- %mem3, %token9 = gpu.alloc async [%token8] (%c1048576) : memref<?xf16>
- %env, %token10 = gpu.create_sparse_env async [%token9]
- %spmat, %token11 = gpu.create_2to4_spmat async [%token10] %env, %c16, %c32, %d_a: memref<16x32xf16>
- %dnmat, %token12 = gpu.create_dn_tensor async [%token11] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
- %dnmat2, %token13 = gpu.create_dn_tensor async [%token12] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
- %bufferSzs, %token14 = gpu.spmm_buffer_size async [%token13] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : tuple<index, index,index> into f16
+ %env, %token7 = gpu.create_sparse_env async [%token6]
+ %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
+ %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
+ %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
+ %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+ %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
+ %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
+ %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
%token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
%token16 = gpu.destroy_sp_mat async [%token15] %spmat
%token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
More information about the Mlir-commits
mailing list