[Mlir-commits] [mlir] [mlir][sparse][gpu] add CSC and BSR format to cuSparse GPU ops (PR #67509)
llvmlistbot at llvm.org
Tue Sep 26 17:32:51 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-mlir-gpu
Changes:
This adds two cuSparse formats (CSC and BSR) to the GPU dialect, together with the proper lowering and CUDA runtime support. It also fixes a few minor omissions.
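For quick reference, a minimal sketch of the two new ops, adapted from the examples in the updated op documentation below (the SSA value names are illustrative):

```mlir
// CSC: column positions, row indices, and values buffers (already on device).
%csc, %t1 = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
    %rowIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>

// BSR: block-row positions, block-column indices, and values buffers; the op
// currently only supports square blocks (%rBlockSize == %cBlockSize).
%bsr, %t2 = gpu.create_bsr async [%dep]
    %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
    %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
```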
---
Full diff: https://github.com/llvm/llvm-project/pull/67509.diff
5 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+91)
- (modified) mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp (+104-5)
- (modified) mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (+28)
- (modified) mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir (+25)
- (modified) mlir/test/Dialect/GPU/sparse-roundtrip.mlir (+24)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 7aa7212af8922a6..34cc129053c3259 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1764,6 +1764,9 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
using this operation. The operation returns a handle to the sparse
matrix descriptor.
+ The CSR format has exactly the same memory layout as its transpose
+ in CSC format (and vice versa).
+
If the `async` keyword is present, the op is executed asynchronously (i.e.
it does not block until the execution has finished on the device). In
that case, it returns a !gpu.async.token in addition to the environment.
@@ -1793,6 +1796,94 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> {
+ let summary = "Create sparse matrix in CSC format operation";
+ let description = [{
+ The `gpu.create_csc` operation initializes a sparse matrix in CSC format
+ with the given sizes from the given position, index, and values buffers.
+ The buffers must already be copied from the host to the device prior to
+ using this operation. The operation returns a handle to the sparse
+ matrix descriptor.
+
+ The CSC format has exactly the same memory layout as its transpose
+ in CSR format (and vice versa).
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token in addition to the environment.
+
+ Example:
+
+ ```mlir
+ %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
+ %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Index:$rows,
+ Index:$cols,
+ Index:$nnz,
+ AnyMemRef:$colPos,
+ AnyMemRef:$rowIdxs,
+ AnyMemRef:$values);
+ let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+ Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict
+ `:` type($colPos) `,` type($rowIdxs) `,` type($values)
+ }];
+}
+
+def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> {
+ let summary = "Create sparse matrix in BSR format operation";
+ let description = [{
+ The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
+ with the given sizes for the matrix and blocks from the given position,
+ index, and values buffers. The buffers must already be copied from the
+ host to the device prior to using this operation. The operation returns
+ a handle to the sparse matrix descriptor.
+
+ The BSR format is similar to CSR, except that the column indices refer to
+ two-dimensional blocks instead of single matrix entries. Note that this
+ operation (currently) only supports storage with **square** blocks,
+ i.e., `rBlockSize == cBlockSize`.
+
+ If the `async` keyword is present, the op is executed asynchronously (i.e.
+ it does not block until the execution has finished on the device). In
+ that case, it returns a !gpu.async.token in addition to the environment.
+
+ Example:
+
+ ```mlir
+ %spmat, %token = gpu.create_bsr async [%dep]
+ %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
+ %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ ```
+ }];
+
+ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+ Index:$brows,
+ Index:$bcols,
+ Index:$bnnz,
+ Index:$rBlockSize,
+ Index:$cBlockSize,
+ AnyMemRef:$bRowPos,
+ AnyMemRef:$bColIdxs,
+ AnyMemRef:$values);
+ let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+ Optional<GPU_AsyncToken>:$asyncToken);
+
+ let assemblyFormat = [{
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ $brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,`
+ $bRowPos `,` $bColIdxs `,` $values attr-dict
+ `:` type($bRowPos) `,` type($bColIdxs) `,` type($values)
+ }];
+}
+
def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
"pruning strategy for 2:4 sparse matrix",
[
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index dd739c9773830e6..097caf23edfa5dd 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -233,6 +233,19 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
{llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
llvmInt32Type, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createCscCallBuilder = {
+ "mgpuCreateCsc",
+ llvmPointerType,
+ {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+ llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+ llvmInt32Type, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder createBsrCallBuilder = {
+ "mgpuCreateBsr",
+ llvmPointerType,
+ {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmInt32Type, llvmInt32Type, llvmInt32Type,
+ llvmPointerType /* void *stream */}};
FunctionCallBuilder destroySpMatCallBuilder = {
"mgpuDestroySpMat",
llvmVoidType,
@@ -554,6 +567,8 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroyDnTensorOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooAoSOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCsrOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCscOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateBsrOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(Create2To4SpMatOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroySpMatOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpMVBufferSizeOp)
@@ -627,11 +642,11 @@ LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
// Corresponding to cusparseIndexType_t defined in cusparse.h.
static int32_t getCuSparseIndexTypeFrom(Type type) {
- if (type.isa<IndexType>())
- return 3; // CUSPARSE_INDEX_64I
- else
+ if (type.isInteger(16))
+ return 1; // CUSPARSE_INDEX_16U
+ if (type.isInteger(32))
return 2; // CUSPARSE_INDEX_32I
- // TODO: add support to CUSPARSE_INDEX_16U: 1
+ return 3; // CUSPARSE_INDEX_64I
}
static int32_t getCuSparseLtDataTypeFrom(Type type) {
@@ -684,6 +699,7 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
}
+
// TODO: We may want a run-time (of the mlir compiler) disablement/warning:
// cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
// runtime (of the CUDA program) error , but it might be great if we could at
@@ -696,9 +712,13 @@ static bool is2To4Sparsity(Value spMat) {
return true;
if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
return false;
+ if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+ return false;
if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
return false;
- if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+ if (auto op = spMat.getDefiningOp<gpu::CreateCscOp>())
+ return false;
+ if (auto op = spMat.getDefiningOp<gpu::CreateBsrOp>())
return false;
// Print the spMat defining op
spMat.getDefiningOp()->print(llvm::errs());
@@ -1916,6 +1936,83 @@ LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertCreateCscOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateCscOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pColPos =
+ MemRefDescriptor(adaptor.getColPos()).allocatedPtr(rewriter, loc);
+ Value pRowIdxs =
+ MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
+ Value pValues =
+ MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers()) {
+ pColPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColPos);
+ pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
+ pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+ }
+ Type pType =
+ llvm::cast<MemRefType>(op.getColPos().getType()).getElementType();
+ Type iType =
+ llvm::cast<MemRefType>(op.getRowIdxs().getType()).getElementType();
+ Type dType =
+ llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+ auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+ auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+ auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+ auto handle =
+ createCscCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
+ pColPos, pRowIdxs, pValues, ptp, itp, dtp, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
+LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::CreateBsrOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+ failed(isAsyncWithOneDependency(rewriter, op)))
+ return failure();
+ Location loc = op.getLoc();
+ auto stream = adaptor.getAsyncDependencies().front();
+ Value pRowPos =
+ MemRefDescriptor(adaptor.getBRowPos()).allocatedPtr(rewriter, loc);
+ Value pColIdxs =
+ MemRefDescriptor(adaptor.getBColIdxs()).allocatedPtr(rewriter, loc);
+ Value pValues =
+ MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+ if (!getTypeConverter()->useOpaquePointers()) {
+ pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
+ pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
+ pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+ }
+ Type pType =
+ llvm::cast<MemRefType>(op.getBRowPos().getType()).getElementType();
+ Type iType =
+ llvm::cast<MemRefType>(op.getBColIdxs().getType()).getElementType();
+ Type dType =
+ llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+ auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+ auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+ auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+ auto handle =
+ createBsrCallBuilder
+ .create(loc, rewriter,
+ {adaptor.getBrows(), adaptor.getBcols(), adaptor.getBnnz(),
+ adaptor.getRBlockSize(), adaptor.getCBlockSize(), pRowPos,
+ pColIdxs, pValues, ptp, itp, dtp, stream})
+ .getResult();
+ rewriter.replaceOp(op, {handle, stream});
+ return success();
+}
+
void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,
@@ -1941,6 +2038,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
ConvertCreateCooOpToGpuRuntimeCallPattern,
ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
ConvertCreateCsrOpToGpuRuntimeCallPattern,
+ ConvertCreateCscOpToGpuRuntimeCallPattern,
+ ConvertCreateBsrOpToGpuRuntimeCallPattern,
ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
ConvertDestroySpMatOpToGpuRuntimeCallPattern,
ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index d2c62b797577aa7..8561aa090d32fd9 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -526,6 +526,34 @@ mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
return reinterpret_cast<void *>(mat);
}
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
+ void *rowIdxs, void *values, int32_t ptp, int32_t itp,
+ int32_t dtp, CUstream /*stream*/) {
+ cusparseSpMatDescr_t mat = nullptr;
+ auto pTp = static_cast<cusparseIndexType_t>(ptp);
+ auto iTp = static_cast<cusparseIndexType_t>(itp);
+ auto dTp = static_cast<cudaDataType_t>(dtp);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
+ rowIdxs, values, pTp, iTp,
+ CUSPARSE_INDEX_BASE_ZERO, dTp))
+ return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
+ intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
+ int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
+ cusparseSpMatDescr_t mat = nullptr;
+ auto pTp = static_cast<cusparseIndexType_t>(ptp);
+ auto iTp = static_cast<cusparseIndexType_t>(itp);
+ auto dTp = static_cast<cudaDataType_t>(dtp);
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
+ &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
+ CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
+ return reinterpret_cast<void *>(mat);
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroySpMat(void *m, CUstream /*stream*/) {
cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
index d9c1c98a0208306..f86d929e0e19acf 100644
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -128,4 +128,29 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL: func @csc_and_bsr
+ // CHECK: llvm.call @mgpuStreamCreate
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuMemAlloc
+ // CHECK: llvm.call @mgpuCreateCsc
+ // CHECK: llvm.call @mgpuCreateBsr
+ // CHECK: llvm.call @mgpuDestroySpMat
+ // CHECK: llvm.call @mgpuDestroySpMat
+ // CHECK: llvm.call @mgpuStreamSynchronize
+ // CHECK: llvm.call @mgpuStreamDestroy
+ func.func @csc_and_bsr(%arg0: index) {
+ %token0 = gpu.wait async
+ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+ %csc, %token3 = gpu.create_csc async [%token2]
+ %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+ : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %bsr, %token4 = gpu.create_bsr async [%token3]
+ %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+ : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %token5 = gpu.destroy_sp_mat async [%token4] %csc
+ %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+ gpu.wait [%token6]
+ return
+ }
}
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
index 31273ef8c8ce612..1e74aa3a4813a03 100644
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -128,4 +128,28 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL: func @csc_and_bsr
+ // CHECK: %{{.*}} = gpu.wait async
+ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.create_csc async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.create_bsr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ // CHECK: gpu.wait [%{{.*}}]
+ // CHECK: return
+ func.func @csc_and_bsr(%arg0: index) {
+ %token0 = gpu.wait async
+ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+ %csc, %token3 = gpu.create_csc async [%token2]
+ %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+ : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %bsr, %token4 = gpu.create_bsr async [%token3]
+ %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+ : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %token5 = gpu.destroy_sp_mat async [%token4] %csc
+ %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+ gpu.wait [%token6]
+ return
+ }
+
}
``````````
https://github.com/llvm/llvm-project/pull/67509
More information about the Mlir-commits mailing list