[Mlir-commits] [mlir] 1e491c4 - [mlir][sparse][gpu] add 2:4 spmm prune_and_check flag
Kun Wu
llvmlistbot at llvm.org
Tue Aug 1 11:24:53 PDT 2023
Author: Kun Wu
Date: 2023-08-01T18:24:18Z
New Revision: 1e491c425b7ebd21f8001727692c0657e961e758
URL: https://github.com/llvm/llvm-project/commit/1e491c425b7ebd21f8001727692c0657e961e758
DIFF: https://github.com/llvm/llvm-project/commit/1e491c425b7ebd21f8001727692c0657e961e758.diff
LOG: [mlir][sparse][gpu] add 2:4 spmm prune_and_check flag
Differential Revision: https://reviews.llvm.org/D155909
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 9a8b03c694d34c..e3cd604fcc30ce 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1713,6 +1713,22 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
+ "pruning strategy for 2:4 sparse matrix",
+ [
+ I32EnumAttrCase<"NONE", 0>,
+ I32EnumAttrCase<"PRUNE_ONLY", 1>,
+ I32EnumAttrCase<"PRUNE_AND_CHECK", 2>,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_Prune2To4SpMatFlagAttr : EnumAttr<GPU_Dialect, GPU_Prune2To4SpMatFlag,
+ "prune_2to4_spmat_flag">{
+ let defaultValue = "Prune2To4SpMatFlag::PRUNE_AND_CHECK";
+}
+
def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
let summary = "Create sparse matrix with 2:4 sparsity operation";
@@ -1730,20 +1746,21 @@ def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]>
Example:
```mlir
- %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf64>
+ %spmat, %token = gpu.create_2to4_spmat async [%dep] {PRUNE_AND_CHECK} %rows, %cols, %mem: memref<?xf64>
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
Index:$rows,
Index:$cols,
+ GPU_Prune2To4SpMatFlagAttr:$pruneFlag,
AnyMemRef:$memref);
let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
Optional<GPU_AsyncToken>:$asyncToken);
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+ `{` $pruneFlag `}` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
}];
}
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 93034e253c065b..1838c8e6050c34 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -286,7 +286,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
"mgpuCuSparseLtSpMMBufferSize",
llvmVoidType,
{llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmInt32Type,
+ llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
llvmPointerType /*void *stream*/}};
FunctionCallBuilder createCuSparseLtSpMMBuilder = {
"mgpuCuSparseLtSpMM",
@@ -747,6 +747,9 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
llvm_unreachable("unsupported element type");
}
+static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
+ return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
+}
// TODO: We may want a run-time (of the mlir compiler) disablement/warning:
// cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
runtime (of the CUDA program) error, but it might be great if we could at
@@ -1628,6 +1631,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
auto stream = adaptor.getAsyncDependencies().front();
Value bufferSize;
if (is2To4Sparsity(op.getSpmatA())) {
+ auto prune_flag =
+ genConstInt32From(rewriter, loc, get2To4PruneFlag(op.getSpmatA()));
auto computeType = genConstInt32From(
rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
@@ -1637,7 +1642,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
createCuSparseLtSpMMBufferSizeBuilder
.create(loc, rewriter,
{bufferSize, modeA, modeB, adaptor.getSpmatA(),
- adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream})
+ adaptor.getDnmatB(), adaptor.getDnmatC(), computeType,
+ prune_flag, stream})
.getResult();
auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 2e9a4f064c3fce..c40bcd178060cc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -692,7 +692,8 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
Value token = genFirstWait(rewriter, loc);
Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(
- loc, spMatHandleTp, tokenTp, token, szm, szk, matA);
+ loc, spMatHandleTp, tokenTp, token, szm, szk,
+ gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
Value spMatA = spGenA->getResult(0);
token = spGenA->getResult(1);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index fe3c229aff970a..d2c977a925ea07 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -630,9 +630,13 @@ mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
// Several things are being done in this stage, algorithm selection, planning,
// and returning workspace and compressed matrices data buffer sizes.
// The parameter prune_flag indicates whether pruning and the pruning check
// will happen: 0 means neither pruning nor a prune check, 1 means pruning
// only, and 2 means pruning and a prune check.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
- void *c, int32_t ctp, CUstream stream) {
+ void *c, int32_t ctp, int32_t prune_flag,
+ CUstream stream) {
assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
// TODO: support more advanced settings, e.g., the input right operand is a
// sparse matrix assuming matA is the sparse matrix
@@ -662,23 +666,26 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
&cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
// Pruning step (in-place).
- CUSPARSE_REPORT_IF_ERROR(
- cusparseLtSpMMAPrune(&cusparseLt_env, &(matA->matmul), matA->values,
- matA->values, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+ if (prune_flag > 0)
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
+ &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
+ CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
// Check structure of A.
// Note that this adds a synchronization on the stream.
// TODO: Do we want that?
- int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
- CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
- &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
- int valid = 0;
- mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
- mgpuStreamSynchronize(stream);
- mgpuMemFree(dvalid, stream);
- if (valid != 0)
- fprintf(stderr, "CUPARSE-LT: sparse matrix is not 2:4; computed results "
- "will be invalid\n");
+ if (prune_flag == 2) {
+ int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
+ &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
+ int valid = 0;
+ mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
+ mgpuStreamSynchronize(stream);
+ mgpuMemFree(dvalid, stream);
+ if (valid != 0)
+ fprintf(stderr, "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
+ "will be invalid\n");
+ }
CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
&cusparseLt_env, &(matA->plan), &workspace_size_))
diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
index 45677f29d3bf79..113d49c507e9c8 100644
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -18,7 +18,7 @@ module attributes {gpu.container_module} {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xf16>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
- %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1: memref<?xf16>
+ %spmat, %token4 = gpu.create_2to4_spmat async [%token2] {PRUNE_AND_CHECK} %arg0, %arg0, %mem1: memref<?xf16>
%dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf16>
%bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16
%token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
index 3d103d31305ebc..0769e217782c6c 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -30,7 +30,7 @@
// CHECK: %[[VAL_27:.*]] = memref.dim %[[VAL_16]], %[[VAL_3]] : memref<?x?xf16>
// CHECK: %[[VAL_28:.*]] = memref.dim %[[VAL_23]], %[[VAL_4]] : memref<?x?xf16>
// CHECK: %[[VAL_29:.*]] = gpu.wait async
-// CHECK: %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]] %[[VAL_26]], %[[VAL_27]], %[[VAL_9]] : memref<?x?xf16>
+// CHECK: %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]]{{{.*}}} %[[VAL_26]], %[[VAL_27]], %[[VAL_9]] : memref<?x?xf16>
// CHECK: %[[VAL_32:.*]], %[[VAL_33:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_31]]] %[[VAL_16]], %[[VAL_27]], %[[VAL_28]] : index, index into memref<?x?xf16>
// CHECK: %[[VAL_34:.*]], %[[VAL_35:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_33]]] %[[VAL_23]], %[[VAL_26]], %[[VAL_28]] : index, index into memref<?x?xf16>
// CHECK: %[[VAL_36:.*]]:3, %[[VAL_37:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_35]]] %[[VAL_30]], %[[VAL_32]], %[[VAL_34]] : index, index, index into f16
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 54a740516e2e69..7b3769f195e4fa 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -31,7 +31,7 @@ module {
%token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
%token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
%token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
- %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+ %spmat, %token8 = gpu.create_2to4_spmat async [%token6]{PRUNE_AND_CHECK} %c16, %c32, %d_a: memref<16x32xf16>
%dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
%dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
%bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
More information about the Mlir-commits
mailing list