[Mlir-commits] [mlir] f6f817d - [mlir][sparse][gpu] minor improvements in 2:4 example
Aart Bik
llvmlistbot at llvm.org
Thu Jul 13 16:20:43 PDT 2023
Author: Aart Bik
Date: 2023-07-13T16:20:27-07:00
New Revision: f6f817d0d7b7b03191c007a638a6067fa083dcee
URL: https://github.com/llvm/llvm-project/commit/f6f817d0d7b7b03191c007a638a6067fa083dcee
DIFF: https://github.com/llvm/llvm-project/commit/f6f817d0d7b7b03191c007a638a6067fa083dcee.diff
LOG: [mlir][sparse][gpu] minor improvements in 2:4 example
Reviewed By: K-Wu
Differential Revision: https://reviews.llvm.org/D155244
Added:
Modified:
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
Removed:
################################################################################
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index f1e985e70793bc..54a740516e2e69 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,5 +1,5 @@
//
-// NOTE: this test requires gpu-sm80
+// NOTE: this test requires gpu-sm80 and cusparselt
//
// RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
@@ -41,7 +41,8 @@ module {
%token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
%token16 = gpu.destroy_sp_mat async [%token15] %spmat
%token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
- %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+ %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+ %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
%token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
%token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
%token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
@@ -69,9 +70,9 @@ module {
%c64 = arith.constant 64 : index
// Matrices A, B, C (16x32, 32x16, 16x16).
- %a = memref.alloc() : memref<16x32xf16> // 16x32 but 2:4, row-major
- %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
- %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
+ %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+ %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+ %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
//
// Setup matrix A.
@@ -181,27 +182,8 @@ module {
vector.print %pb0 : vector<16xf16>
}
- // Maps the provided host buffers into the device address space.
- // Writes from the host are guaranteed to be visible to device
- // kernels that are launched afterwards. Writes from the device
- // are guaranteed to be visible on the host after synchronizing
- // with the device kernel completion.
- %cast_a = memref.cast %a : memref<16x32xf16> to memref<*xf16>
- gpu.host_register %cast_a : memref<*xf16>
- %cast_b = memref.cast %b : memref<32x16xf16> to memref<*xf16>
- gpu.host_register %cast_b : memref<*xf16>
- %cast_c = memref.cast %c : memref<16x16xf16> to memref<*xf16>
- gpu.host_register %cast_c : memref<*xf16>
-
// Call the kernel.
- %t1 = arith.constant 1 : index
- %t32 = arith.constant 32 : index
call @sampled_matmul (%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()
-
- // Unmaps the host buffers.
- gpu.host_unregister %cast_a : memref<*xf16>
- gpu.host_unregister %cast_b : memref<*xf16>
- gpu.host_unregister %cast_c : memref<*xf16>
//
// Verify computed matrix C.
@@ -227,7 +209,7 @@ module {
%pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
vector.print %pc0 : vector<16xf16>
}
-
+
llvm.call @mgpuDestroySparseLtEnv() : () -> ()
return
}
More information about the Mlir-commits
mailing list