[Mlir-commits] [mlir] 981cf16 - [mlir][sparse][gpu] add SpMM to GPU ops dialect
Aart Bik
llvmlistbot at llvm.org
Fri May 19 12:46:22 PDT 2023
Author: Aart Bik
Date: 2023-05-19T12:46:11-07:00
New Revision: 981cf1678d6f3399d3fb5e16434669f3992ac6fd
URL: https://github.com/llvm/llvm-project/commit/981cf1678d6f3399d3fb5e16434669f3992ac6fd
DIFF: https://github.com/llvm/llvm-project/commit/981cf1678d6f3399d3fb5e16434669f3992ac6fd.diff
LOG: [mlir][sparse][gpu] add SpMM to GPU ops dialect
Reviewed By: ThomasRaoux, K-Wu
Differential Revision: https://reviews.llvm.org/D150618
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
mlir/test/Dialect/GPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 982ec0c6d4c87..77e65972038da 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1623,7 +1623,7 @@ def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
let summary = "Destroy dense vector operation";
let description = [{
- The `gpu.destroy_sparse_env` operation releases all resources of a dense
+ The `gpu.destroy_dn_vec` operation releases all resources of a dense
vector represented by a handle that was previously created by a
`gpu.create_dn_vec` operation.
@@ -1647,6 +1647,64 @@ def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
+ let summary = "Create dense matrix operation";
+ let description = [{
+ The `gpu.create_dn_mat` operation initializes a dense matrix from
+ the given values buffer and sizes. The buffer must already be copied
+ from the host to the device prior to using this operation. The
+ operation returns a handle to the dense matrix descriptor.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token in addition to the matrix handle.
+
+ Example:
+
+ ```mlir
+ %dmat, %token = gpu.create_dn_mat async [%dep] %rows, %cols, %mem : memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Index:$rows,
+ Index:$cols,
+ AnyMemRef:$memref);
+ let results = (outs Res<GPU_SparseHandle>:$dmat, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+ }];
+}
+
+def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
+ let summary = "Destroy dense matrix operation";
+ let description = [{
+ The `gpu.destroy_dn_mat` operation releases all resources of a dense
+ matrix represented by a handle that was previously created by a
+ `gpu.create_dn_mat` operation.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.destroy_dn_mat async [%dep] %dmat
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Arg<GPU_SparseHandle>:$dmat);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dmat attr-dict
+ }];
+}
+
def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
let summary = "Create sparse matrix in COO format operation";
let description = [{
@@ -1812,4 +1870,69 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
+ let summary = "Precompute buffersize for SpMM operation";
+ let description = [{
+ The `gpu.spmm_buffer_size` operation returns the buffer size required
+ to perform the SpMM operation on the given sparse and dense matrix.
+ The operation expects handles returned by previous sparse operations
+ to construct an environment and the operands for SpMM.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token in addition to the buffer size.
+
+ Example:
+
+ ```mlir
+ %buffersz, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA, %dnmatB, %dnmatC
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ GPU_SparseHandle:$env,
+ GPU_SparseHandle:$spmatA,
+ GPU_SparseHandle:$dnmatB,
+ GPU_SparseHandle:$dnmatC);
+ let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $env `,` $spmatA `,` $dnmatB `,` $dnmatC attr-dict
+ }];
+}
+
+def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
+ let summary = "SpMM operation";
+ let description = [{
+ The `gpu.spmm` operation performs the SpMM operation on the given sparse and
+ dense matrix, and buffer. The operation expects handles returned by previous
+ sparse operations to construct an environment and the operands for SpMM. The
+ buffer must have been allocated on the device.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token.
+
+ Example:
+
+ ```mlir
+ %token = gpu.spmm async [%dep] %env, %spmatA, %dnmatB, %dnmatC, %buffer : memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ GPU_SparseHandle:$env,
+ GPU_SparseHandle:$spmatA,
+ GPU_SparseHandle:$dnmatB,
+ GPU_SparseHandle:$dnmatC,
+ AnyMemRef:$buffer);
+ let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $env `,` $spmatA `,` $dnmatB `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+ }];
+}
+
#endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 033d8c933539f..8c79ee3745d9b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -209,6 +209,15 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
"mgpuDestroyDnVec",
llvmVoidType,
{llvmPointerType, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createDnMatCallBuilder = {
+ "mgpuCreateDnMat",
+ llvmPointerType,
+ {llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmInt32Type,
+ llvmPointerType /* void *stream */}};
+ FunctionCallBuilder destroyDnMatCallBuilder = {
+ "mgpuDestroyDnMat",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType /* void *stream */}};
FunctionCallBuilder createCooCallBuilder = {
"mgpuCreateCoo",
llvmPointerType,
@@ -235,6 +244,16 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
llvmVoidType,
{llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
llvmPointerType, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder spMMBufferSizeCallBuilder = {
+ "mgpuSpMMBufferSize",
+ llvmIntPtrType,
+ {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmPointerType /* void *stream */}};
+ FunctionCallBuilder spMMCallBuilder = {
+ "mgpuSpMM",
+ llvmVoidType,
+ {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmPointerType /* void *stream */}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -477,6 +496,30 @@ class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
+class ConvertCreateDnMatOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp> {
+public:
+ ConvertCreateDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::CreateDnMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroyDnMatOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp> {
+public:
+ ConvertDestroyDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
class ConvertCreateCooOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp> {
public:
@@ -539,6 +582,32 @@ class ConvertSpMVOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
+class ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMMBufferSizeOp> {
+public:
+ ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::SpMMBufferSizeOp>(typeConverter) {
+ }
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::SpMMBufferSizeOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertSpMMOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMMOp> {
+public:
+ ConvertSpMMOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::SpMMOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(gpu::SpMMOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
@@ -1180,6 +1249,43 @@ LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateDnMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pMat =
+ MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers())
+ pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat);
+ Type dType = op.getMemref().getType().cast<MemRefType>().getElementType();
+ auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+ dType.getIntOrFloatBitWidth());
+ auto handle =
+ createDnMatCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getRows(), adaptor.getCols(), pMat, dw, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertDestroyDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::CreateCooOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -1302,6 +1408,44 @@ LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::SpMMBufferSizeOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ auto bufferSize =
+ spMMBufferSizeCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(),
+ adaptor.getDnmatC(), stream})
+ .getResult();
+ rewriter.replaceOp(op, {bufferSize, stream});
+ return success();
+}
+
+LogicalResult ConvertSpMMOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::SpMMOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pBuf =
+ MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers())
+ pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
+ spMMCallBuilder.create(loc, rewriter,
+ {adaptor.getEnv(), adaptor.getSpmatA(),
+ adaptor.getDnmatB(), adaptor.getDnmatC(), pBuf,
+ stream});
+ rewriter.replaceOp(op, {stream});
+ return success();
+}
+
void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,
@@ -1329,11 +1473,15 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
ConvertCreateDnVecOpToGpuRuntimeCallPattern,
ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
+ ConvertCreateDnMatOpToGpuRuntimeCallPattern,
+ ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
ConvertCreateCooOpToGpuRuntimeCallPattern,
ConvertCreateCsrOpToGpuRuntimeCallPattern,
ConvertDestroySpMatOpToGpuRuntimeCallPattern,
ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
- ConvertSpMVOpToGpuRuntimeCallPattern>(converter);
+ ConvertSpMVOpToGpuRuntimeCallPattern,
+ ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern,
+ ConvertSpMMOpToGpuRuntimeCallPattern>(converter);
patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 5040afb0915a8..e5d4cdd738847 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -338,7 +338,7 @@ mgpuSpMVBufferSize(void *h, void *a, void *x, void *y, CUstream /*stream*/) {
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
+mgpuSpMV(void *h, void *a, void *x, void *y, void *buf, CUstream /*stream*/) {
cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
@@ -347,5 +347,35 @@ mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
double beta = 1.0;
CUSPARSE_REPORT_IF_ERROR(
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
- &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, b))
+ &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buf))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMMBufferSize(void *h, void *a, void *b, void *c, CUstream /*stream*/) {
+ cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+ cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
+ cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
+ double alpha = 1.0;
+ double beta = 1.0;
+ size_t bufferSize = 0;
+ CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
+ handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB, &beta, matC,
+ CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
+ return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMM(void *h, void *a, void *b, void *c, void *buf, CUstream /*stream*/) {
+ cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+ cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
+ cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
+ double alpha = 1.0;
+ double beta = 1.0;
+ CUSPARSE_REPORT_IF_ERROR(
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB, &beta,
+ matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, buf))
}
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
index 6f163f9269399..dcef27357b531 100644
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -10,7 +10,7 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuCreateCoo
// CHECK: llvm.call @mgpuCreateDnVec
// CHECK: llvm.call @mgpuSpMVBufferSize
- // CHECK: llvm.call @mgpuSpM
+ // CHECK: llvm.call @mgpuSpMV
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuDestroyDnVec
// CHECK: llvm.call @mgpuDestroySparseEnv
@@ -32,6 +32,36 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL: func @matmul
+ // CHECK: llvm.call @mgpuStreamCreate
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuCreateSparseEnv
+ // CHECK: llvm.call @mgpuCreateCsr
+ // CHECK: llvm.call @mgpuCreateDnMat
+ // CHECK: llvm.call @mgpuSpMMBufferSize
+ // CHECK: llvm.call @mgpuSpMM
+ // CHECK: llvm.call @mgpuDestroySpMat
+ // CHECK: llvm.call @mgpuDestroyDnMat
+ // CHECK: llvm.call @mgpuDestroySparseEnv
+ // CHECK: llvm.call @mgpuStreamSynchronize
+ // CHECK: llvm.call @mgpuStreamDestroy
+ func.func @matmul(%arg0: index) {
+ %token0 = gpu.wait async
+ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+ %env, %token3 = gpu.create_sparse_env async [%token2]
+ %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref<?xf64>
+ %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat
+ %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+ %token8 = gpu.destroy_sp_mat async [%token7] %spmat
+ %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+ %token10 = gpu.destroy_sparse_env async [%token9] %env
+ gpu.wait [%token10]
+ return
+ }
+
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 00e2421c0283c..d6c1bef340c93 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -338,14 +338,22 @@ module attributes {gpu.container_module} {
%bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec
// CHECK: gpu.spmv async
%token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+ // CHECK: gpu.create_dn_mat async
+ %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref<?xf64>
+ // CHECK: gpu.spmm_buffer_size async
+ %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat
+ // CHECK: gpu.spmm async
+ %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+ // CHECK: gpu.destroy_dn_mat async
+ %token12 = gpu.destroy_dn_mat async [%token11] %dnmat
// CHECK: gpu.destroy_sp_mat async
- %token9 = gpu.destroy_sp_mat async [%token8] %spmat
+ %token13 = gpu.destroy_sp_mat async [%token12] %spmat
// CHECK: gpu.destroy_dn_vec async
- %token10 = gpu.destroy_dn_vec async [%token9] %dnvec
+ %token14 = gpu.destroy_dn_vec async [%token13] %dnvec
// CHECK: gpu.destroy_sparse_env async
- %token11 = gpu.destroy_sparse_env async [%token10] %env
+ %token15 = gpu.destroy_sparse_env async [%token14] %env
// CHECK: gpu.wait
- gpu.wait [%token11]
+ gpu.wait [%token15]
return
}
}
More information about the Mlir-commits
mailing list