[Mlir-commits] [mlir] 4df01dc - [mlir][sparse][gpu][nvidia] add pruning step and check to 2:4 matrix multiplication
Aart Bik
llvmlistbot at llvm.org
Fri Jul 14 12:08:21 PDT 2023
Author: Aart Bik
Date: 2023-07-14T12:08:13-07:00
New Revision: 4df01dc27079b70d2fdec8d795e525b7955c60f7
URL: https://github.com/llvm/llvm-project/commit/4df01dc27079b70d2fdec8d795e525b7955c60f7
DIFF: https://github.com/llvm/llvm-project/commit/4df01dc27079b70d2fdec8d795e525b7955c60f7.diff
LOG: [mlir][sparse][gpu][nvidia] add pruning step and check to 2:4 matrix multiplication
(1) without the check, the results may silently be wrong, so the check is needed
(2) add a pruning step to guarantee the 2:4 property
Note that, in the longer run, we may want to split out the pruning step somehow,
or make it optional.
Reviewed By: K-Wu
Differential Revision: https://reviews.llvm.org/D155320
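For context, the sequence added to the wrapper is: prune the sparse operand in place with cusparseLtSpMMAPrune, then verify the 2:4 structure with cusparseLtSpMMAPruneCheck and report if it does not hold. The following is a minimal standalone sketch of that pattern; the helper name and the plain CUDA allocation calls are illustrative only (the actual wrapper uses its own mgpuMemAlloc/mgpuMemcpy/mgpuMemFree helpers, as shown in the diff below), but the cuSPARSELt calls are the ones used by this commit:

#include <cstdio>
#include <cuda_runtime.h>
#include <cusparseLt.h>

// Illustrative sketch (not the wrapper itself): prune A in place to 2:4 and
// verify the result, assuming `handle` and `matmul` were initialized earlier
// and `dA` points to the device values of the sparse operand A.
static void pruneAndCheck24(cusparseLtHandle_t *handle,
                            cusparseLtMatmulDescriptor_t *matmul, void *dA,
                            cudaStream_t stream) {
  // In-place pruning: every group of four values keeps at most two nonzeros.
  cusparseLtSpMMAPrune(handle, matmul, /*d_in=*/dA, /*d_out=*/dA,
                       CUSPARSELT_PRUNE_SPMMA_STRIP, stream);

  // Ask cuSPARSELt to verify the 2:4 structure; the flag lands in device memory.
  int *dValid = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&dValid), sizeof(int));
  cusparseLtSpMMAPruneCheck(handle, matmul, dA, dValid, stream);

  // Reading the flag on the host requires synchronizing the stream; this is
  // the synchronization point mentioned in the TODO in the diff below.
  int valid = 0;
  cudaMemcpyAsync(&valid, dValid, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(dValid);

  if (valid != 0)
    fprintf(stderr, "sparse matrix is not 2:4; computed results will be invalid\n");
}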
Added:
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
Modified:
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
Removed:
################################################################################
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index ec822125eda456..0ea7127e931659 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -567,7 +567,7 @@ mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
 // and returning workspace and compressed matrices data buffer sizes.
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
-                             void *c, int32_t ctp, CUstream /*stream*/) {
+                             void *c, int32_t ctp, CUstream stream) {
   assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
   // TODO: support more advanced settings, e.g., the input right operand is a
   // sparse matrix assuming matA is the sparse matrix
@@ -596,6 +596,25 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
   CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
       &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
 
+  // Pruning step (in-place).
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseLtSpMMAPrune(&cusparseLt_env, &(matA->matmul), matA->values,
+                           matA->values, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+
+  // Check structure of A.
+  // Note that this adds a synchronization on the stream.
+  // TODO: Do we want that?
+  int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
+      &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
+  int valid = 0;
+  mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
+  mgpuStreamSynchronize(stream);
+  mgpuMemFree(dvalid, stream);
+  if (valid != 0)
+    fprintf(stderr, "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
+                    "will be invalid\n");
+
   CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
       &cusparseLt_env, &(matA->plan), &workspace_size_))
   CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
new file mode 100644
index 00000000000000..062798a39b8106
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -0,0 +1,132 @@
+//
+// NOTE: this test requires gpu-sm80 and cusparselt
+//
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: %s \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --e main --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#map0 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+module {
+
+  llvm.func @mgpuCreateSparseLtEnv()
+  llvm.func @mgpuDestroySparseLtEnv()
+
+  //
+  // TODO: This uses our temporary ATTRIBUTE, replace with 2:4 type!
+  //
+  func.func @matmul(%arg0: tensor<16x16xf16>,
+                    %arg1: tensor<16x16xf16>,
+                    %arg2: tensor<16x16xf16>) -> tensor<16x16xf16> {
+    %0 = linalg.generic {
+      DENSE24,
+      indexing_maps = [#map0, #map1, #map2],
+      iterator_types = ["parallel", "parallel", "reduction"]
+    }
+    ins(%arg0, %arg1 : tensor<16x16xf16>, tensor<16x16xf16>)
+    outs(%arg2 : tensor<16x16xf16>) {
+    ^bb0(%in: f16, %in_0: f16, %out: f16):
+      %1 = arith.mulf %in, %in_0 : f16
+      %2 = arith.addf %out, %1 : f16
+      linalg.yield %2 : f16
+    } -> tensor<16x16xf16>
+    return %0 : tensor<16x16xf16>
+  }
+
+  func.func @main() {
+    llvm.call @mgpuCreateSparseLtEnv() : () -> ()
+
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+
+    %f0 = arith.constant 0.0 : f16
+    %f1 = arith.constant 1.0 : f16
+    %f4 = arith.constant 4.0 : f16
+
+    // Initial A, B, C matrices.
+    %A = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      %val = arith.andi %j, %c1 : index
+      %cmp = arith.cmpi eq, %val, %c0 : index
+      %res = arith.select %cmp, %f4, %f1 : f16
+      tensor.yield %res : f16
+    } : tensor<16x16xf16>
+    %B = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      %cmp = arith.cmpi eq, %i, %j : index
+      %res = arith.select %cmp, %f1, %f0 : f16
+      tensor.yield %res : f16
+    } : tensor<16x16xf16>
+    %C = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      tensor.yield %f0 : f16
+    } : tensor<16x16xf16>
+
+    // Call the kernel.
+    //
+    // By effectively computing D = A B + C with id(B) and zero(C)
+    // the resulting matrix returns the pruned A back to the caller.
+    //
+    %D = call @matmul(%A, %B, %C): (tensor<16x16xf16>, tensor<16x16xf16>, tensor<16x16xf16>) -> (tensor<16x16xf16>)
+
+    //
+    // This was the original matrix.
+    //
+    // CHECK: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
+    //
+    scf.for %i = %c0 to %c16 step %c1 {
+      %va = vector.transfer_read %A[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
+      vector.print %va : vector<16xf16>
+    }
+
+    //
+    // This is the STRIP-pruned matrix.
+    //
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
+    //
+    scf.for %i = %c0 to %c16 step %c1 {
+      %vd = vector.transfer_read %D[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
+      vector.print %vd : vector<16xf16>
+    }
+
+    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
+    return
+  }
+}
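A note on why the second set of CHECK lines looks the way it does: with CUSPARSELT_PRUNE_SPMMA_STRIP, each group of four consecutive values along a row keeps (at most) the two entries of largest magnitude and zeroes the rest, so the rows ( 4, 1, 4, 1, ... ) come back as ( 4, 0, 4, 0, ... ). A rough host-side illustration of that rule, not the cuSPARSELt implementation (tie-breaking may differ):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Rough illustration of 2:4 strip pruning: within each group of four
// consecutive values along a row, keep the two entries of largest magnitude
// and zero out the other two. Applied to a row 4, 1, 4, 1, ... this yields
// 4, 0, 4, 0, ..., matching the pruned-matrix CHECK lines above.
void prune24Strip(float *row, size_t n) {
  for (size_t g = 0; g + 4 <= n; g += 4) {
    size_t idx[4] = {g, g + 1, g + 2, g + 3};
    std::sort(idx, idx + 4, [&](size_t a, size_t b) {
      return std::fabs(row[a]) > std::fabs(row[b]);
    });
    row[idx[2]] = 0.0f; // drop the two smallest-magnitude entries
    row[idx[3]] = 0.0f;
  }
}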