[Mlir-commits] [mlir] 86eff48 - [mlir][sparse][gpu] force 16-byte alignment on data structs for cuSparseLt

Aart Bik llvmlistbot at llvm.org
Thu Jul 13 10:45:24 PDT 2023


Author: Aart Bik
Date: 2023-07-13T10:45:15-07:00
New Revision: 86eff489e75f709d3abaea71e48f07a7e049278e

URL: https://github.com/llvm/llvm-project/commit/86eff489e75f709d3abaea71e48f07a7e049278e
DIFF: https://github.com/llvm/llvm-project/commit/86eff489e75f709d3abaea71e48f07a7e049278e.diff

LOG: [mlir][sparse][gpu] force 16-byte alignment on data structs for cuSparseLt

Also makes some minor consistency edits in the cuSparseLt wrapper lib.

Reviewed By: Peiming, K-Wu

Differential Revision: https://reviews.llvm.org/D155139

Added: 
    

Modified: 
    mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
    mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Removed: 
    


################################################################################
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index f8615fc88b4db1..6c383de2e6a69b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1376,8 +1376,8 @@ LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
     if (isSpMMCusparseLtOp(op.getDnTensor())) {
       auto handleSz = rewriter.create<LLVM::ConstantOp>(
           loc, getIndexType(), rewriter.getIndexAttr(11032));
-      handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
-                                               llvmInt8Type, handleSz);
+      handle = rewriter.create<LLVM::AllocaOp>(
+          loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16);
       handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
 
       createLtDnMatCallBuilder
@@ -1554,8 +1554,8 @@ LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
   // CUDA runner asserts the size is 44104 bytes.
   auto handleSz = rewriter.create<LLVM::ConstantOp>(
       loc, getIndexType(), rewriter.getIndexAttr(44104));
-  Value handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
-                                                 llvmInt8Type, handleSz);
+  Value handle = rewriter.create<LLVM::AllocaOp>(
+      loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16);
   handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
 
   create2To4SpMatCallBuilder
@@ -1644,8 +1644,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
         rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
     auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
                                                    rewriter.getIndexAttr(3));
-    auto bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
-                                                      llvmInt64Type, three);
+    auto bufferSize = rewriter.create<LLVM::AllocaOp>(
+        loc, llvmInt64PointerType, llvmInt64Type, three, /*alignment=*/16);
     createCuSparseLtSpMMBufferSizeBuilder
         .create(loc, rewriter,
                 {bufferSize, modeA, modeB, adaptor.getSpmatA(),

diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 545dca6f9bdf1c..b5dbc5144ddd7a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -512,7 +512,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
   ScopedContext scopedContext;
   assert(!cusparseLt_initiated &&
          "client called mgpuCreateSparseLtEnv() twice");
-  // Note that cuSparseLt still uses cusparseStatus_t
+  // Note that cuSparseLt still uses cusparseStatus_t.
   CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
   cusparseLt_initiated = true;
 }
@@ -527,29 +527,22 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
                           int32_t dtp, CUstream /*stream*/) {
   assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
-  // CusparseLt expects the descriptors to be zero-initialized.
-  memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData));
   auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
+  // CusparseLt expects the descriptors to be zero-initialized.
+  memset(dnmat_handle, 0, sizeof(cusparseLtDnMatHandleAndData));
+  dnmat_handle->values = values;
   auto dTp = static_cast<cudaDataType_t>(dtp);
-  // assuming row-major when deciding lda
+  // Assume row-major when deciding lda.
+  const uint32_t alignment = 16;
   CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
       &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
-      /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW))
-  dnmat_handle->values = values;
-}
-
-// This can be used to destroy both dense matrices and sparse matrices in
-// cusparseLt
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
-  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+      alignment, dTp, CUSPARSE_ORDER_ROW))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) {
-  auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m);
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) {
+  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat)))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
@@ -561,11 +554,17 @@ mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
   memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData));
   spmat_handle->values = values;
   auto dTp = static_cast<cudaDataType_t>(dtp);
-  // assuming row-major when deciding lda
+  // Assume row-major when deciding lda.
+  const uint32_t alignment = 16;
   CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
-      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols,
-      /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW,
-      CUSPARSELT_SPARSITY_50_PERCENT))
+      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment,
+      dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
+  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat)))
 }
 
 // Several things are being done in this stage, algorithm selection, planning,
@@ -607,7 +606,7 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
       &cusparseLt_env, &(matA->plan), &compressed_size_,
       &compressed_buffer_size_))
 
-  // avoid zero-alloc
+  // Avoid zero-allocation.
   *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_);
   *compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_);
   *compressed_buffer_size =


        


More information about the Mlir-commits mailing list