[Mlir-commits] [mlir] 86eff48 - [mlir][sparse][gpu] force 16-byte alignment on data structs for cuSparseLt
Aart Bik
llvmlistbot at llvm.org
Thu Jul 13 10:45:24 PDT 2023
Author: Aart Bik
Date: 2023-07-13T10:45:15-07:00
New Revision: 86eff489e75f709d3abaea71e48f07a7e049278e
URL: https://github.com/llvm/llvm-project/commit/86eff489e75f709d3abaea71e48f07a7e049278e
DIFF: https://github.com/llvm/llvm-project/commit/86eff489e75f709d3abaea71e48f07a7e049278e.diff
LOG: [mlir][sparse][gpu] force 16-byte alignment on data structs for cuSparseLt
Also makes some minor consistency edits in the cuSparseLt wrapper lib.
Reviewed By: Peiming, K-Wu
Differential Revision: https://reviews.llvm.org/D155139
Added:
Modified:
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index f8615fc88b4db1..6c383de2e6a69b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1376,8 +1376,8 @@ LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
if (isSpMMCusparseLtOp(op.getDnTensor())) {
auto handleSz = rewriter.create<LLVM::ConstantOp>(
loc, getIndexType(), rewriter.getIndexAttr(11032));
- handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
- llvmInt8Type, handleSz);
+ handle = rewriter.create<LLVM::AllocaOp>(
+ loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16);
handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
createLtDnMatCallBuilder
@@ -1554,8 +1554,8 @@ LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
// CUDA runner asserts the size is 44104 bytes.
auto handleSz = rewriter.create<LLVM::ConstantOp>(
loc, getIndexType(), rewriter.getIndexAttr(44104));
- Value handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
- llvmInt8Type, handleSz);
+ Value handle = rewriter.create<LLVM::AllocaOp>(
+ loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16);
handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
create2To4SpMatCallBuilder
@@ -1644,8 +1644,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
rewriter.getIndexAttr(3));
- auto bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
- llvmInt64Type, three);
+ auto bufferSize = rewriter.create<LLVM::AllocaOp>(
+ loc, llvmInt64PointerType, llvmInt64Type, three, /*alignment=*/16);
createCuSparseLtSpMMBufferSizeBuilder
.create(loc, rewriter,
{bufferSize, modeA, modeB, adaptor.getSpmatA(),
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 545dca6f9bdf1c..b5dbc5144ddd7a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -512,7 +512,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
ScopedContext scopedContext;
assert(!cusparseLt_initiated &&
"client called mgpuCreateSparseLtEnv() twice");
- // Note that cuSparseLt still uses cusparseStatus_t
+ // Note that cuSparseLt still uses cusparseStatus_t.
CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
cusparseLt_initiated = true;
}
@@ -527,29 +527,22 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
int32_t dtp, CUstream /*stream*/) {
assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
- // CusparseLt expects the descriptors to be zero-initialized.
- memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData));
auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
+ // CusparseLt expects the descriptors to be zero-initialized.
+ memset(dnmat_handle, 0, sizeof(cusparseLtDnMatHandleAndData));
+ dnmat_handle->values = values;
auto dTp = static_cast<cudaDataType_t>(dtp);
- // assuming row-major when deciding lda
+ // Assume row-major when deciding lda.
+ const uint32_t alignment = 16;
CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
&cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
- /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW))
- dnmat_handle->values = values;
-}
-
-// This can be used to destroy both dense matrices and sparse matrices in
-// cusparseLt
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
- auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
- CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+ alignment, dTp, CUSPARSE_ORDER_ROW))
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) {
- auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m);
- CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) {
+ auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat)))
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
@@ -561,11 +554,17 @@ mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData));
spmat_handle->values = values;
auto dTp = static_cast<cudaDataType_t>(dtp);
- // assuming row-major when deciding lda
+ // Assume row-major when deciding lda.
+ const uint32_t alignment = 16;
CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
- &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols,
- /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW,
- CUSPARSELT_SPARSITY_50_PERCENT))
+ &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment,
+ dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
+ auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat)))
}
// Several things are being done in this stage, algorithm selection, planning,
@@ -607,7 +606,7 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
&cusparseLt_env, &(matA->plan), &compressed_size_,
&compressed_buffer_size_))
- // avoid zero-alloc
+ // Avoid zero-allocation.
*workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_);
*compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_);
*compressed_buffer_size =
More information about the Mlir-commits mailing list