[Mlir-commits] [mlir] 1e491c4 - [mlir][sparse][gpu] add 2:4 spmm prune_and_check flag

Kun Wu llvmlistbot at llvm.org
Tue Aug 1 11:24:53 PDT 2023


Author: Kun Wu
Date: 2023-08-01T18:24:18Z
New Revision: 1e491c425b7ebd21f8001727692c0657e961e758

URL: https://github.com/llvm/llvm-project/commit/1e491c425b7ebd21f8001727692c0657e961e758
DIFF: https://github.com/llvm/llvm-project/commit/1e491c425b7ebd21f8001727692c0657e961e758.diff

LOG: [mlir][sparse][gpu] add 2:4 spmm prune_and_check flag

Differential Revision: https://reviews.llvm.org/D155909

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
    mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
    mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
    mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
    mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
    mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
    mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 9a8b03c694d34c..e3cd604fcc30ce 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1713,6 +1713,22 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
   }];
 }
 
+def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
+  "pruning strategy for 2:4 sparse matrix",
+  [
+    I32EnumAttrCase<"NONE", 0>,
+    I32EnumAttrCase<"PRUNE_ONLY", 1>,
+    I32EnumAttrCase<"PRUNE_AND_CHECK", 2>,
+  ]> {
+    let genSpecializedAttr = 0;
+    let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_Prune2To4SpMatFlagAttr : EnumAttr<GPU_Dialect, GPU_Prune2To4SpMatFlag,
+                                   "prune_2to4_spmat_flag">{
+  let defaultValue = "Prune2To4SpMatFlag::PRUNE_AND_CHECK";
+}
+
 
 def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix with 2:4 sparsity operation";
@@ -1730,20 +1746,21 @@ def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]>
     Example:
 
     ```mlir
-    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf64>
+    %spmat, %token = gpu.create_2to4_spmat async [%dep] {PRUNE_AND_CHECK} %rows, %cols, %mem: memref<?xf64>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        Index:$rows,
                        Index:$cols,
+                       GPU_Prune2To4SpMatFlagAttr:$pruneFlag,
                        AnyMemRef:$memref);
   let results = (outs Res<GPU_SparseSpMatHandle>:$spMat, 
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+     `{` $pruneFlag `}` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
   }];
 }
 

diff  --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 93034e253c065b..1838c8e6050c34 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -286,7 +286,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuCuSparseLtSpMMBufferSize",
       llvmVoidType,
       {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
-       llvmPointerType, llvmPointerType, llvmInt32Type,
+       llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createCuSparseLtSpMMBuilder = {
       "mgpuCuSparseLtSpMM",
@@ -747,6 +747,9 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
   llvm_unreachable("unsupported element type");
 }
 
+static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
+  return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
+}
 // TODO:  We may want a run-time (of the mlir compiler) disablement/warning:
 // cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
 // runtime (of the CUDA program) error , but it might be great if we could at
@@ -1628,6 +1631,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
   auto stream = adaptor.getAsyncDependencies().front();
   Value bufferSize;
   if (is2To4Sparsity(op.getSpmatA())) {
+    auto prune_flag =
+        genConstInt32From(rewriter, loc, get2To4PruneFlag(op.getSpmatA()));
     auto computeType = genConstInt32From(
         rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
     auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
@@ -1637,7 +1642,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
     createCuSparseLtSpMMBufferSizeBuilder
         .create(loc, rewriter,
                 {bufferSize, modeA, modeB, adaptor.getSpmatA(),
-                 adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream})
+                 adaptor.getDnmatB(), adaptor.getDnmatC(), computeType,
+                 prune_flag, stream})
         .getResult();
 
     auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(

diff  --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 2e9a4f064c3fce..c40bcd178060cc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -692,7 +692,8 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);
   Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(
-      loc, spMatHandleTp, tokenTp, token, szm, szk, matA);
+      loc, spMatHandleTp, tokenTp, token, szm, szk,
+      gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
 
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);

diff  --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index fe3c229aff970a..d2c977a925ea07 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -630,9 +630,13 @@ mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
 
 // Several things are being done in this stage, algorithm selection, planning,
 // and returning workspace and compressed matrices data buffer sizes.
+// The parameter prune_flag indicates whether pruning and a pruning check
+// will happen: 0 means no pruning and no pruning check, 1 means pruning
+// only, and 2 means both pruning and a pruning check.
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
-                             void *c, int32_t ctp, CUstream stream) {
+                             void *c, int32_t ctp, int32_t prune_flag,
+                             CUstream stream) {
   assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
   // TODO: support more advanced settings, e.g., the input right operand is a
   // sparse matrix assuming matA is the sparse matrix
@@ -662,23 +666,26 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
       &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
 
   // Pruning step (in-place).
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseLtSpMMAPrune(&cusparseLt_env, &(matA->matmul), matA->values,
-                           matA->values, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+  if (prune_flag > 0)
+    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
+        &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
+        CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
 
   // Check structure of A.
   // Note that this adds a synchronization on the stream.
   // TODO: Do we want that?
-  int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
-      &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
-  int valid = 0;
-  mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
-  mgpuStreamSynchronize(stream);
-  mgpuMemFree(dvalid, stream);
-  if (valid != 0)
-    fprintf(stderr, "CUPARSE-LT: sparse matrix is not 2:4; computed results "
-                    "will be invalid\n");
+  if (prune_flag == 2) {
+    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
+    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
+        &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
+    int valid = 0;
+    mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
+    mgpuStreamSynchronize(stream);
+    mgpuMemFree(dvalid, stream);
+    if (valid != 0)
+      fprintf(stderr, "CUPARSE-LT: sparse matrix is not 2:4; computed results "
+                      "will be invalid\n");
+  }
 
   CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
       &cusparseLt_env, &(matA->plan), &workspace_size_))

diff  --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
index 45677f29d3bf79..113d49c507e9c8 100644
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -18,7 +18,7 @@ module attributes {gpu.container_module} {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xf16>
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
-    %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1:  memref<?xf16>
+    %spmat, %token4 = gpu.create_2to4_spmat async [%token2] {PRUNE_AND_CHECK} %arg0, %arg0, %mem1:  memref<?xf16>
     %dnmat, %token5 = gpu.create_dn_tensor async [%token4]  %mem2, %arg0, %arg0 : index, index into memref<?xf16>
     %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16
     %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16

diff  --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
index 3d103d31305ebc..0769e217782c6c 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -30,7 +30,7 @@
 // CHECK:           %[[VAL_27:.*]] = memref.dim %[[VAL_16]], %[[VAL_3]] : memref<?x?xf16>
 // CHECK:           %[[VAL_28:.*]] = memref.dim %[[VAL_23]], %[[VAL_4]] : memref<?x?xf16>
 // CHECK:           %[[VAL_29:.*]] = gpu.wait async
-// CHECK:           %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]] %[[VAL_26]], %[[VAL_27]], %[[VAL_9]] : memref<?x?xf16>
+// CHECK:           %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]]{{{.*}}} %[[VAL_26]], %[[VAL_27]], %[[VAL_9]] : memref<?x?xf16>
 // CHECK:           %[[VAL_32:.*]], %[[VAL_33:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_31]]] %[[VAL_16]], %[[VAL_27]], %[[VAL_28]] : index, index into memref<?x?xf16>
 // CHECK:           %[[VAL_34:.*]], %[[VAL_35:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_33]]] %[[VAL_23]], %[[VAL_26]], %[[VAL_28]] : index, index into memref<?x?xf16>
 // CHECK:           %[[VAL_36:.*]]:3, %[[VAL_37:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_35]]] %[[VAL_30]], %[[VAL_32]], %[[VAL_34]] : index, index, index into f16

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 54a740516e2e69..7b3769f195e4fa 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -31,7 +31,7 @@ module {
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+    %spmat, %token8 = gpu.create_2to4_spmat async  [%token6]{PRUNE_AND_CHECK} %c16, %c32, %d_a: memref<16x32xf16>
     %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
     %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
     %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16


        


More information about the Mlir-commits mailing list