[Mlir-commits] [mlir] be2dd22 - [mlir][sparse][gpu] reuse CUDA environment handle throughout instance lifetime
Kun Wu
llvmlistbot at llvm.org
Fri Jun 30 14:53:20 PDT 2023
Author: Kun Wu
Date: 2023-06-30T21:52:34Z
New Revision: be2dd22b8f47e6f0e56063910cf7cd37482960cc
URL: https://github.com/llvm/llvm-project/commit/be2dd22b8f47e6f0e56063910cf7cd37482960cc
DIFF: https://github.com/llvm/llvm-project/commit/be2dd22b8f47e6f0e56063910cf7cd37482960cc.diff
LOG: [mlir][sparse][gpu] reuse CUDA environment handle throughout instance lifetime
Differential Revision: https://reviews.llvm.org/D153173
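Editor's note (not part of the committed log): after this change the explicit `!gpu.sparse.env_handle` operand disappears from the sparse GPU ops, and the CUDA runtime wrappers instead keep a single cuSPARSE (and, when enabled, cuSPARSELt) handle alive for the whole instance lifetime. The new `mgpuCreateSparseEnv()` / `mgpuDestroySparseEnv()` entry points in CudaRuntimeWrappers.cpp take no arguments, and the in-tree comment states that clients are responsible for calling them in a thread-safe way, e.g. at program start before any threads are spawned. The following is a minimal, hypothetical host-side sketch of that usage; the `main` scaffolding and the placeholder comment are illustrative, only the two `mgpu*SparseEnv` signatures are taken from the diff below.

```cpp
// Hypothetical client sketch: the shared cusparseHandle_t now lives inside the
// MLIR CUDA runtime wrapper library, so the environment is created once,
// reused by every mgpuSpMV/mgpuSpMM/mgpuSDDMM call, and destroyed once.
extern "C" void mgpuCreateSparseEnv();   // allocates the process-wide handle
extern "C" void mgpuDestroySparseEnv();  // releases it at shutdown

int main() {
  mgpuCreateSparseEnv();    // once, before any threads or sparse GPU work
  // ... run MLIR-generated code that lowers gpu.spmv/gpu.spmm/gpu.sddmm to
  //     the mgpu* runtime calls shown in the diff ...
  mgpuDestroySparseEnv();   // once, after all sparse GPU work has finished
  return 0;
}
```

The same pattern applies to `mgpuCreateSparseLtEnv()` / `mgpuDestroySparseLtEnv()` when the 2:4-sparsity (cuSPARSELt) path is used.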
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
mlir/test/Dialect/GPU/ops.mlir
mlir/test/Dialect/GPU/sparse-roundtrip.mlir
mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index d3d31cdb75b48b..842e63aff7bd9f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -110,12 +110,6 @@ class MMAMatrixOf<list<Type> allowedTypes> :
"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;
// Types for all sparse handles.
-def GPU_SparseEnvHandle :
- DialectType<GPU_Dialect,
- CPred<"llvm::isa<::mlir::gpu::SparseEnvHandleType>($_self)">,
- "sparse environment handle type">,
- BuildableType<"mlir::gpu::SparseEnvHandleType::get($_builder.getContext())">;
-
def GPU_SparseDnTensorHandle :
DialectType<GPU_Dialect,
CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index e32ea5c38e6e13..1178c0895b5024 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -165,7 +165,7 @@ class MMAMatrixType
void addAsyncDependency(Operation *op, Value token);
// Handle types for sparse.
-enum class SparseHandleKind { Env, SpMat, DnTensor };
+enum class SparseHandleKind { SpMat, DnTensor };
template <SparseHandleKind K>
class SparseHandleType
@@ -176,7 +176,6 @@ class SparseHandleType
using Base::Base;
};
-using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 0e13295b1db078..9a8b03c694d34c 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1540,63 +1540,6 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
// Operation on sparse matrices, called from the host
// (currently lowers to cuSparse for CUDA only, no ROCM lowering).
//
-
-def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> {
- let summary = "Create sparse environment operation";
- let description = [{
- The `gpu.create_sparse_env` operation initializes a sparse environment.
- It must be executed prior to any other sparse operation. The operation
- returns a handle to the new sparse environment.
-
- If the `async` keyword is present, the op is executed asynchronously (i.e.
- it does not block until the execution has finished on the device). In
- that case, it returns a !gpu.async.token in addition to the environment.
-
- Example:
-
- ```mlir
- %env, %token = gpu.create_sparse_env async [%dep]
- ```
- }];
-
- let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
- let results = (outs Res<GPU_SparseEnvHandle>:$env,
- Optional<GPU_AsyncToken>:$asyncToken);
- let assemblyFormat = [{
- custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
- }];
-}
-
-def GPU_DestroySparseEnvOp : GPU_Op<
- "destroy_sparse_env",
- [GPU_AsyncOpInterface]> {
- let summary = "Destroy sparse environment operation";
- let description = [{
- The `gpu.destroy_sparse_env` operation releases all resources of a sparse
- environment represented by a handle that was previously created by a
- `gpu.create_sparse_env` operation.
-
- If the `async` keyword is present, the op is executed asynchronously (i.e.
- it does not block until the execution has finished on the device). In
- that case, it returns a !gpu.async.token in addition to the environment.
-
- Example:
-
- ```mlir
- %token = gpu.destroy_sparse_env async [%dep] %env
- ```
- }];
-
- let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- Arg<GPU_SparseEnvHandle>:$env);
- let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
-
- let assemblyFormat = [{
- custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env attr-dict
- }];
-}
-
def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
let summary = "Create dense tensor operation";
let description = [{
@@ -1612,19 +1555,18 @@ def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, Att
Example:
```mlir
- %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref<?xf64>
+ %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref<?xf64>
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
AnyMemRef:$memref,
Variadic<Index>:$dims);
let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken);
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
+ $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
}];
}
@@ -1788,12 +1730,11 @@ def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]>
Example:
```mlir
- %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref<?xf64>
+ %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf64>
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
Index:$rows,
Index:$cols,
AnyMemRef:$memref);
@@ -1802,7 +1743,7 @@ def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]>
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+ $rows `,` $cols `,` $memref attr-dict `:` type($memref)
}];
}
@@ -1877,11 +1818,10 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
Example:
```mlir
- %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY into f32
+ %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_SparseSpMatHandle:$spmatA,
GPU_SparseDnTensorHandle:$dnX,
@@ -1894,7 +1834,6 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
"Type":$bufferSz,
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$spmatA,
"Value":$dnX,
"Value":$dnY,
@@ -1902,12 +1841,12 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
, [{
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
- env, modeA, spmatA, dnX, dnY, computeType);}]>
+ modeA, spmatA, dnX, dnY, computeType);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType
+ $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType
}];
}
@@ -1930,11 +1869,10 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
Example:
```mlir
- %token = gpu.spmv async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY : memref<?xf64> into bf16
+ %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref<?xf64> into bf16
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_SparseSpMatHandle:$spmatA,
GPU_SparseDnTensorHandle:$dnX,
@@ -1946,20 +1884,19 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
let builders = [OpBuilder<(ins
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$spmatA,
"Value":$dnX,
"Value":$dnY,
"Type":$computeType,
"Value":$buffer), [{
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
- return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
+ return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
spmatA, dnX, dnY, computeType, buffer);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType
+ $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType
}];
}
@@ -1982,12 +1919,11 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, Att
Example:
```mlir
- %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32
+ %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_TransposeModeAttr:$modeB,
GPU_SparseSpMatHandle:$spmatA,
@@ -2001,7 +1937,6 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, Att
"Type":$bufferSzs,
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$spmatA,
"Value":$dnmatB,
"Value":$dnmatC,
@@ -2009,12 +1944,12 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, Att
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies,
- env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]>
+ modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType
+ $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType
}];
}
@@ -2037,12 +1972,11 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]
Example:
```mlir
- %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32
+ %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32
```
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_TransposeModeAttr:$modeB,
GPU_SparseSpMatHandle:$spmatA,
@@ -2055,7 +1989,6 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]
let builders = [OpBuilder<(ins
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$spmatA,
"Value":$dnmatB,
"Value":$dnmatC,
@@ -2063,13 +1996,13 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]
"ValueRange":$buffers), [{
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
- return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
+ return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType
+ $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType
}];
}
@@ -2088,7 +2021,7 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
Example:
```mlir
- %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
+ %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
```
The matrix arguments can also be associated with one of the following
@@ -2097,7 +2030,6 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_TransposeModeAttr:$modeB,
GPU_SparseDnTensorHandle:$dnmatA,
@@ -2110,7 +2042,6 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
"Type":$bufferSz,
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$dnmatA,
"Value":$dnmatB,
"Value":$spmatC,
@@ -2118,12 +2049,12 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
- env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
+ modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType
+ $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType
}];
}
@@ -2142,7 +2073,7 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
Example:
```mlir
- %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32
+ %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32
```
The matrix arguments can also be associated with one of the following
@@ -2151,7 +2082,6 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
}];
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- GPU_SparseEnvHandle:$env,
GPU_TransposeModeAttr:$modeA,
GPU_TransposeModeAttr:$modeB,
GPU_SparseDnTensorHandle:$dnmatA,
@@ -2164,7 +2094,6 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
let builders = [OpBuilder<(ins
"Type":$asyncToken,
"ValueRange":$asyncDependencies,
- "Value":$env,
"Value":$dnmatA,
"Value":$dnmatB,
"Value":$spmatC,
@@ -2172,13 +2101,13 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
"Value":$buffer), [{
auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
- return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
+ return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]>
];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
- $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType
+ $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType
}];
}
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 580de21f407970..f8615fc88b4db1 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -204,14 +204,6 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
"mgpuSetDefaultDevice",
llvmVoidType,
{llvmInt32Type /* uint32_t devIndex */}};
- FunctionCallBuilder createSparseEnvCallBuilder = {
- "mgpuCreateSparseEnv",
- llvmPointerType,
- {llvmPointerType /* void *stream */}};
- FunctionCallBuilder destroySparseEnvCallBuilder = {
- "mgpuDestroySparseEnv",
- llvmVoidType,
- {llvmPointerType, llvmPointerType /* void *stream */}};
FunctionCallBuilder createDnVecCallBuilder = {
"mgpuCreateDnVec",
llvmPointerType,
@@ -255,51 +247,40 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
FunctionCallBuilder spMVBufferSizeCallBuilder = {
"mgpuSpMVBufferSize",
llvmIntPtrType,
- {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
- llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
+ {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder spMVCallBuilder = {
"mgpuSpMV",
llvmVoidType,
- {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
- llvmPointerType, llvmInt32Type, llvmPointerType,
- llvmPointerType /* void *stream */}};
+ {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType,
+ llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}};
FunctionCallBuilder createSpMMBufferSizeCallBuilder = {
"mgpuSpMMBufferSize",
llvmIntPtrType,
- {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmInt32Type,
- llvmPointerType /* void *stream */}};
+ {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder createSpMMCallBuilder = {
"mgpuSpMM",
llvmVoidType,
- {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType,
+ {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmInt32Type, llvmPointerType,
llvmPointerType /* void *stream */}};
FunctionCallBuilder createSDDMMBufferSizeCallBuilder = {
"mgpuSDDMMBufferSize",
llvmIntPtrType,
- {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmInt32Type,
- llvmPointerType /* void *stream */}};
+ {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder createSDDMMCallBuilder = {
"mgpuSDDMM",
llvmVoidType,
- {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType,
+ {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType,
+ llvmPointerType, llvmInt32Type, llvmPointerType,
llvmPointerType /* void *stream */}};
- FunctionCallBuilder createSparseLtEnvCallBuilder = {
- "mgpuCreateSparseLtEnv",
- llvmVoidType,
- {llvmPointerType, llvmPointerType /* void *stream */}};
- FunctionCallBuilder destroySparseLtEnvCallBuilder = {
- "mgpuDestroySparseLtEnv",
- llvmVoidType,
- {llvmPointerType, llvmPointerType /* void *stream */}};
FunctionCallBuilder createLtDnMatCallBuilder = {
"mgpuCreateCuSparseLtDnMat",
llvmVoidType,
- {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType,
- llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
+ {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+ llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder destroyCuSparseLtSpMatBuilder = {
"mgpuDestroyCuSparseLtSpMat",
llvmVoidType,
@@ -311,20 +292,19 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
FunctionCallBuilder create2To4SpMatCallBuilder = {
"mgpuCusparseLtCreate2To4SpMat",
llvmVoidType,
- {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType,
- llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
+ {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+ llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder createCuSparseLtSpMMBufferSizeBuilder = {
"mgpuCuSparseLtSpMMBufferSize",
llvmVoidType,
- {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
- llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type,
+ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
+ llvmPointerType, llvmPointerType, llvmInt32Type,
llvmPointerType /*void *stream*/}};
FunctionCallBuilder createCuSparseLtSpMMBuilder = {
"mgpuCuSparseLtSpMM",
llvmVoidType,
{llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
- llvmPointerType, llvmPointerType, llvmPointerType,
- llvmPointerType /*void *stream*/}};
+ llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -515,34 +495,6 @@ class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
-class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern
- : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp> {
-public:
- ConvertCreateSparseEnvOpToGpuRuntimeCallPattern(
- LLVMTypeConverter &typeConverter)
- : ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp>(
- typeConverter) {}
-
-private:
- LogicalResult
- matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override;
-};
-
-class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern
- : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp> {
-public:
- ConvertDestroySparseEnvOpToGpuRuntimeCallPattern(
- LLVMTypeConverter &typeConverter)
- : ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp>(
- typeConverter) {}
-
-private:
- LogicalResult
- matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override;
-};
-
class ConvertCreateDnTensorOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp> {
public:
@@ -1393,55 +1345,6 @@ static Value genConstInt32From(OpBuilder &builder, Location loc, T TValue) {
static_cast<int32_t>(TValue));
}
-LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
- gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const {
- if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
- failed(isAsyncWithOneDependency(rewriter, op)))
- return failure();
- Location loc = op.getLoc();
- auto stream = adaptor.getAsyncDependencies().front();
- // Use the cusparseLt create call if the dnmat is used with spmat with
- // 2:4 sparsity
- Value handle;
- if (isSpMMCusparseLtOp(op.getEnv())) {
- // CUDA runner asserts the size is 11024 bytes.
- auto handleSz = rewriter.create<LLVM::ConstantOp>(
- loc, getIndexType(), rewriter.getIndexAttr(11024));
- handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
- llvmInt8Type, handleSz);
- handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
- createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream})
- .getResult();
- } else {
- handle =
- createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult();
- }
- rewriter.replaceOp(op, {handle, stream});
- return success();
-}
-
-LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
- gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const {
- if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
- failed(isAsyncWithOneDependency(rewriter, op)))
- return failure();
- Location loc = op.getLoc();
- auto stream = adaptor.getAsyncDependencies().front();
- // Use the cusparseLt destroy call if the dnmat is used with spmat with
- // 2:4 sparsity
- if (isSpMMCusparseLtOp(op.getEnv())) {
- destroySparseLtEnvCallBuilder.create(loc, rewriter,
- {adaptor.getEnv(), stream});
- } else {
- destroySparseEnvCallBuilder.create(loc, rewriter,
- {adaptor.getEnv(), stream});
- }
- rewriter.replaceOp(op, {stream});
- return success();
-}
-
LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::CreateDnTensorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -1471,7 +1374,6 @@ LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
// the dnmat is used with spmat with 2:4 sparsity
if (dims.size() == 2) {
if (isSpMMCusparseLtOp(op.getDnTensor())) {
- auto envHandle = adaptor.getEnv();
auto handleSz = rewriter.create<LLVM::ConstantOp>(
loc, getIndexType(), rewriter.getIndexAttr(11032));
handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
@@ -1480,7 +1382,7 @@ LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
createLtDnMatCallBuilder
.create(loc, rewriter,
- {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream})
+ {handle, dims[0], dims[1], pTensor, dtp, stream})
.getResult();
} else {
handle =
@@ -1648,7 +1550,6 @@ LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
Type dType =
llvm::cast<MemRefType>(op.getMemref().getType()).getElementType();
auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
- auto envHandle = adaptor.getEnv();
// CUDA runner asserts the size is 44104 bytes.
auto handleSz = rewriter.create<LLVM::ConstantOp>(
@@ -1659,8 +1560,7 @@ LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
create2To4SpMatCallBuilder
.create(loc, rewriter,
- {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat,
- dtp, stream})
+ {handle, adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream})
.getResult();
rewriter.replaceOp(op, {handle, stream});
return success();
@@ -1697,12 +1597,11 @@ LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
auto computeType = genConstInt32From(
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
auto stream = adaptor.getAsyncDependencies().front();
- auto bufferSize =
- spMVBufferSizeCallBuilder
- .create(loc, rewriter,
- {adaptor.getEnv(), modeA, adaptor.getSpmatA(),
- adaptor.getDnX(), adaptor.getDnY(), computeType, stream})
- .getResult();
+ auto bufferSize = spMVBufferSizeCallBuilder
+ .create(loc, rewriter,
+ {modeA, adaptor.getSpmatA(), adaptor.getDnX(),
+ adaptor.getDnY(), computeType, stream})
+ .getResult();
rewriter.replaceOp(op, {bufferSize, stream});
return success();
}
@@ -1723,9 +1622,8 @@ LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite(
if (!getTypeConverter()->useOpaquePointers())
pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
spMVCallBuilder.create(loc, rewriter,
- {adaptor.getEnv(), modeA, adaptor.getSpmatA(),
- adaptor.getDnX(), adaptor.getDnY(), computeType, pBuf,
- stream});
+ {modeA, adaptor.getSpmatA(), adaptor.getDnX(),
+ adaptor.getDnY(), computeType, pBuf, stream});
rewriter.replaceOp(op, {stream});
return success();
}
@@ -1750,9 +1648,8 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
llvmInt64Type, three);
createCuSparseLtSpMMBufferSizeBuilder
.create(loc, rewriter,
- {bufferSize, adaptor.getEnv(), modeA, modeB,
- adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(),
- computeType, stream})
+ {bufferSize, modeA, modeB, adaptor.getSpmatA(),
+ adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream})
.getResult();
auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(
@@ -1774,12 +1671,12 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
} else {
auto computeType = genConstInt32From(
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
- bufferSize = createSpMMBufferSizeCallBuilder
- .create(loc, rewriter,
- {adaptor.getEnv(), modeA, modeB,
- adaptor.getSpmatA(), adaptor.getDnmatB(),
- adaptor.getDnmatC(), computeType, stream})
- .getResult();
+ bufferSize =
+ createSpMMBufferSizeCallBuilder
+ .create(loc, rewriter,
+ {modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(),
+ adaptor.getDnmatC(), computeType, stream})
+ .getResult();
rewriter.replaceOp(op, {bufferSize, stream});
}
return success();
@@ -1797,12 +1694,12 @@ LogicalResult ConvertSDDMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
auto computeType = genConstInt32From(
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
auto stream = adaptor.getAsyncDependencies().front();
- auto bufferSize = createSDDMMBufferSizeCallBuilder
- .create(loc, rewriter,
- {adaptor.getEnv(), modeA, modeB,
- adaptor.getDnmatA(), adaptor.getDnmatB(),
- adaptor.getSpmatC(), computeType, stream})
- .getResult();
+ auto bufferSize =
+ createSDDMMBufferSizeCallBuilder
+ .create(loc, rewriter,
+ {modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(),
+ adaptor.getSpmatC(), computeType, stream})
+ .getResult();
rewriter.replaceOp(op, {bufferSize, stream});
return success();
}
@@ -1832,17 +1729,17 @@ LogicalResult ConvertSpMMOpToGpuRuntimeCallPattern::matchAndRewrite(
}
createCuSparseLtSpMMBuilder.create(
loc, rewriter,
- {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(),
- adaptor.getDnmatC(), pBufs[0], pBufs[1], pBufs[2], stream});
+ {adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(),
+ pBufs[0], pBufs[1], pBufs[2], stream});
} else {
Value pBuf = MemRefDescriptor(adaptor.getBuffers().front())
.allocatedPtr(rewriter, loc);
if (!getTypeConverter()->useOpaquePointers())
pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
- createSpMMCallBuilder.create(
- loc, rewriter,
- {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(),
- adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, pBuf, stream});
+ createSpMMCallBuilder.create(loc, rewriter,
+ {modeA, modeB, adaptor.getSpmatA(),
+ adaptor.getDnmatB(), adaptor.getDnmatC(),
+ computeType, pBuf, stream});
}
rewriter.replaceOp(op, {stream});
return success();
@@ -1872,10 +1769,10 @@ LogicalResult ConvertSDDMMOpToGpuRuntimeCallPattern::matchAndRewrite(
MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
if (!getTypeConverter()->useOpaquePointers())
pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
- createSDDMMCallBuilder.create(
- loc, rewriter,
- {adaptor.getEnv(), modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(),
- adaptor.getSpmatC(), computeType, pBuf, stream});
+ createSDDMMCallBuilder.create(loc, rewriter,
+ {modeA, modeB, adaptor.getDnmatA(),
+ adaptor.getDnmatB(), adaptor.getSpmatC(),
+ computeType, pBuf, stream});
rewriter.replaceOp(op, {stream});
return success();
}
@@ -1887,7 +1784,6 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
- addOpaquePointerConversion<gpu::SparseEnvHandleType>(converter);
patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
@@ -1899,8 +1795,6 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern,
ConvertAsyncYieldToGpuRuntimeCallPattern,
- ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
- ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
ConvertCreateCooOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 06ff669e9a71a6..da59b59064803d 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -146,7 +146,6 @@ struct GPUInlinerInterface : public DialectInlinerInterface {
void GPUDialect::initialize() {
addTypes<AsyncTokenType>();
addTypes<MMAMatrixType>();
- addTypes<SparseEnvHandleType>();
addTypes<SparseDnTensorHandleType>();
addTypes<SparseSpMatHandleType>();
addOperations<
@@ -162,8 +161,6 @@ void GPUDialect::initialize() {
static std::string getSparseHandleKeyword(SparseHandleKind kind) {
switch (kind) {
- case SparseHandleKind::Env:
- return "sparse.env_handle";
case SparseHandleKind::DnTensor:
return "sparse.dntensor_handle";
case SparseHandleKind::SpMat:
@@ -216,8 +213,6 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
shape, elementType, operand);
}
- if (keyword == getSparseHandleKeyword(SparseHandleKind::Env))
- return SparseEnvHandleType::get(context);
if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
return SparseDnTensorHandleType::get(context);
if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
@@ -231,8 +226,6 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
TypeSwitch<Type>(type)
.Case<AsyncTokenType>([&](Type) { os << "async.token"; })
- .Case<SparseEnvHandleType>(
- [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::Env); })
.Case<SparseDnTensorHandleType>([&](Type) {
os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
})
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 073aa8419ea53f..49ca395f33cf5f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -494,26 +494,21 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
// Create sparse environment and sparse matrix/dense vector handles.
Type indexTp = rewriter.getIndexType();
- Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
Value token = genFirstWait(rewriter, loc);
- auto env =
- rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
- Value handle = env.getResult(0);
- token = env.getAsyncToken();
Operation *spGenA =
genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA,
rowA, colA, valA, isCOO, enableRT);
Value spMatA = spGenA->getResult(0);
token = spGenA->getResult(1);
auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX);
+ loc, dnTensorHandleTp, tokenTp, token, vecX, szX);
Value dnX = dvecX.getResult(0);
token = dvecX.getAsyncToken();
auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY);
+ loc, dnTensorHandleTp, tokenTp, token, vecY, szY);
Value dnY = dvecY.getResult(0);
token = dvecY.getAsyncToken();
@@ -521,7 +516,7 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
// Precompute buffersize for SpMV.
auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
- loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY,
+ loc, indexTp, tokenTp, token, spMatA, dnX, dnY,
/*computeType=*/dnYType);
Value bufferSz = bufferComp.getResult(0);
token = bufferComp.getAsyncToken();
@@ -530,9 +525,8 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
token = buf.getAsyncToken();
// Perform the SpMV.
- auto spmvComp =
- rewriter.create<gpu::SpMVOp>(loc, tokenTp, token, handle, spMatA, dnX,
- dnY, /*computeType=*/dnYType, buffer);
+ auto spmvComp = rewriter.create<gpu::SpMVOp>(
+ loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer);
token = spmvComp.getAsyncToken();
// Copy data back to host and free all the resoures.
@@ -542,8 +536,6 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
.getAsyncToken();
- token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
- .getAsyncToken();
token = genDeallocMemRef(rewriter, loc, rowA, token);
if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);
@@ -601,27 +593,22 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
// Create sparse environment and sparse matrix/dense matrix handles.
Type indexTp = rewriter.getIndexType();
- Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
Value token = genFirstWait(rewriter, loc);
- auto env =
- rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
- Value handle = env.getResult(0);
- token = env.getAsyncToken();
Operation *spGenA =
genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA,
rowA, colA, valA, isCOO, enableRT);
Value spMatA = spGenA->getResult(0);
token = spGenA->getResult(1);
auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnTensorHandleTp, tokenTp, token, handle, matB,
+ loc, dnTensorHandleTp, tokenTp, token, matB,
SmallVector<Value>{szk, szn});
Value dnB = dmatB.getResult(0);
token = dmatB.getAsyncToken();
auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnTensorHandleTp, tokenTp, token, handle, matC,
+ loc, dnTensorHandleTp, tokenTp, token, matC,
SmallVector<Value>{szm, szn});
Value dnC = dmatC.getResult(0);
token = dmatC.getAsyncToken();
@@ -630,7 +617,7 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
// Precompute buffersize for SpMM.
auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
- loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC,
+ loc, indexTp, tokenTp, token, spMatA, dnB, dnC,
/*computeType=*/dmatCType);
Value bufferSz = bufferComp.getResult(0);
token = bufferComp.getAsyncToken();
@@ -641,9 +628,8 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
// Perform the SpMM.
- auto spmmComp =
- rewriter.create<gpu::SpMMOp>(loc, tokenTp, token, handle, spMatA, dnB,
- dnC, /*computeType=*/dnCType, buffer);
+ auto spmmComp = rewriter.create<gpu::SpMMOp>(
+ loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer);
token = spmmComp.getAsyncToken();
// Copy data back to host and free all the resoures.
@@ -653,9 +639,6 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
.getAsyncToken();
- token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
- .getAsyncToken();
- token = genDeallocMemRef(rewriter, loc, rowA, token);
if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);
@@ -715,24 +698,16 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
// Create sparse environment and sparse matrix/dense matrix handles.
Type indexTp = rewriter.getIndexType();
- Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
Type dnMatHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
Value token = genFirstWait(rewriter, loc);
- auto env =
- rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
- Value handle = env.getResult(0);
- token = env.getAsyncToken();
-
auto dmatA = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnMatHandleTp, tokenTp, token, handle, matA,
- SmallVector<Value>{szm, szk});
+ loc, dnMatHandleTp, tokenTp, token, matA, SmallVector<Value>{szm, szk});
Value dnA = dmatA.getResult(0);
token = dmatA.getAsyncToken();
auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
- loc, dnMatHandleTp, tokenTp, token, handle, matB,
- SmallVector<Value>{szk, szn});
+ loc, dnMatHandleTp, tokenTp, token, matB, SmallVector<Value>{szk, szn});
Value dnB = dmatB.getResult(0);
token = dmatB.getAsyncToken();
@@ -745,7 +720,7 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
// Precompute buffersize for SDDMM.
auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
- loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType);
+ loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);
Value bufferSz = bufferComp.getResult(0);
token = bufferComp.getAsyncToken();
auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -753,8 +728,8 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
token = buf.getAsyncToken();
// Perform the SDDMM.
- auto sddmmComp = rewriter.create<gpu::SDDMMOp>(
- loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer);
+ auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, dnA, dnB,
+ spMatC, dnCType, buffer);
token = sddmmComp.getAsyncToken();
// Copy data back to host and free all the resoures.
@@ -764,8 +739,6 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
.getAsyncToken();
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
.getAsyncToken();
- token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
- .getAsyncToken();
token = genDeallocMemRef(rewriter, loc, buffer, token);
token = genDeallocMemRef(rewriter, loc, matA, token);
token = genDeallocMemRef(rewriter, loc, matB, token);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index acf3412e3da58a..5f6b47031b068c 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -79,6 +79,22 @@ class ScopedContext {
~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
+// Note that (1) Nvidia confirms the safety to share handle across multiple
+// instances, and streams. (2) Clients are responsible to call the @mgpu
+// environment initialization/destruction in a thread-safe manner, e.g.,
+// at the beginning of the program before multi-threads are created.
+#ifdef MLIR_ENABLE_CUDA_CUSPARSE
+static cusparseHandle_t cusparse_env = nullptr;
+
+#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
+// cusparseLtHandle_t is not a pointer type, so we need an additional flag to
+// indicate whether it is initialized.
+static cusparseLtHandle_t cusparseLt_env;
+static bool cusparseLt_initiated = false;
+
+#endif // MLIR_ENABLE_CUDA_CUSPARSELT
+#endif // MLIR_ENABLE_CUDA_CUSPARSE
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
ScopedContext scopedContext;
CUmodule module = nullptr;
@@ -270,17 +286,18 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
(beta##p) = reinterpret_cast<void *>(&(beta##d)); \
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuCreateSparseEnv(CUstream /*stream*/) {
- cusparseHandle_t handle = nullptr;
- CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle))
- return reinterpret_cast<void *>(handle);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() {
+ // ScopedContext is for cuda initialization.
+ ScopedContext scopedContext;
+ assert(!cusparse_env && "client called mgpuCreateSparseEnv() twice");
+ CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env));
+ return;
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroySparseEnv(void *h, CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
- CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle))
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() {
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
+ CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env));
+ cusparse_env = nullptr;
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
@@ -359,10 +376,9 @@ mgpuDestroySpMat(void *m, CUstream /*stream*/) {
CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
-mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp,
- CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
+ int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) {
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
@@ -370,32 +386,32 @@ mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp,
cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
ALPHABETA(cTp, alpha, beta)
size_t bufferSize = 0;
- CUSPARSE_REPORT_IF_ERROR(
- cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY,
- cTp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+ CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
+ cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp,
+ CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a,
- void *x, void *y,
- int32_t ctp, void *buf,
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x,
+ void *y, int32_t ctp,
+ void *buf,
CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
ALPHABETA(cTp, alpha, beta)
- CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX,
+ CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX,
betap, vecY, cTp,
CUSPARSE_SPMV_ALG_DEFAULT, buf))
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
-mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
int32_t ctp, CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
@@ -405,15 +421,16 @@ mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
ALPHABETA(cTp, alpha, beta)
size_t bufferSize = 0;
CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
- handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+ cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
- int32_t ctp, void *buf, CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb,
+ void *a, void *b, void *c,
+ int32_t ctp, void *buf,
+ CUstream /*stream*/) {
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
@@ -421,16 +438,16 @@ mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
ALPHABETA(cTp, alpha, beta)
- CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA,
- matB, betap, matC, cTp,
+ CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap,
+ matA, matB, betap, matC, cTp,
CUSPARSE_SPMM_ALG_DEFAULT, buf))
}
// TODO: add support to passing alpha and beta as arguments
extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
-mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
int32_t ctp, CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
@@ -440,15 +457,16 @@ mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
ALPHABETA(cTp, alpha, beta)
size_t bufferSize = 0;
CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize(
- handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+ cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize))
return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
- int32_t ctp, void *buf, CUstream /*stream*/) {
- cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb,
+ void *a, void *b, void *c,
+ int32_t ctp, void *buf,
+ CUstream /*stream*/) {
+ assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
@@ -456,8 +474,8 @@ mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
auto cTp = static_cast<cudaDataType_t>(ctp);
ALPHABETA(cTp, alpha, beta)
- CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(handle, modeA, modeB, alphap, matA,
- matB, betap, matC, cTp,
+ CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap,
+ matA, matB, betap, matC, cTp,
CUSPARSE_SDDMM_ALG_DEFAULT, buf))
}
@@ -490,30 +508,33 @@ static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104,
static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032,
"Unexpected cusparseLt dense matrix handle size");
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) {
- // note that cuSparseLt still uses cusparseStatus_t
- CUSPARSE_REPORT_IF_ERROR(
- cusparseLtInit(reinterpret_cast<cusparseLtHandle_t *>(h)))
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
+ // ScopedContext is for cuda initialization.
+ ScopedContext scopedContext;
+ assert(!cusparseLt_initiated &&
+ "client called mgpuCreateSparseLtEnv() twice");
+ // Note that cuSparseLt still uses cusparseStatus_t
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
+ cusparseLt_initiated = true;
}
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) {
- auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
- CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle))
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() {
+ assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env));
+ cusparseLt_initiated = false;
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols,
- void *values, int32_t dtp, CUstream /*stream*/) {
- auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
+ int32_t dtp, CUstream /*stream*/) {
+ assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
// CusparseLt expects the descriptors to be zero-initialized.
memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData));
auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
auto dTp = static_cast<cudaDataType_t>(dtp);
// assuming row-major when deciding lda
CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
- handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
+ &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
/*alignment=*/16, dTp, CUSPARSE_ORDER_ROW))
dnmat_handle->values = values;
}
@@ -533,29 +554,29 @@ mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) {
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols,
+mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
void *values, int32_t dtp, CUstream /*stream*/) {
+ assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
// CusparseLt expects the descriptors to be zero-initialized.
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData));
spmat_handle->values = values;
- auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
auto dTp = static_cast<cudaDataType_t>(dtp);
// assuming row-major when deciding lda
CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
- handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16,
- dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
+ &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols,
+ /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW,
+ CUSPARSELT_SPARSITY_50_PERCENT))
}
// Several things are being done in this stage, algorithm selection, planning,
// and returning workspace and compressed matrices data buffer sizes.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a,
- void *b, void *c, int32_t ctp,
- CUstream /*stream*/) {
+mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
+ void *c, int32_t ctp, CUstream /*stream*/) {
+ assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
// TODO: support more advanced settings, e.g., the input right operand is a
// sparse matrix assuming matA is the sparse matrix
- auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
@@ -568,22 +589,25 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a,
cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
- handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat),
- &(matC->mat), &(matC->mat), cTp))
+ &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
+ &(matB->mat), &(matC->mat), &(matC->mat), cTp))
CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
- handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT))
+ &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
+ CUSPARSELT_MATMUL_ALG_DEFAULT))
int alg = 0;
CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
- handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
+ &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
sizeof(alg)))
CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
- handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
+ &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
- CUSPARSE_REPORT_IF_ERROR(
- cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_))
+ CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
+ &cusparseLt_env, &(matA->plan), &workspace_size_))
CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
- handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_))
+ &cusparseLt_env, &(matA->plan), &compressed_size_,
+ &compressed_buffer_size_))
+
// avoid zero-alloc
*workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_);
*compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_);
@@ -592,23 +616,23 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a,
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace,
+mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace,
void *dA_compressed, void *dA_compressedBuffer,
CUstream stream) {
- auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+ assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
ALPHABETA(CUDA_R_32F, alpha, beta)
CUSPARSE_REPORT_IF_ERROR(
- cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values),
+ cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values),
dA_compressed, dA_compressedBuffer, stream))
// TODO: add support for multi-stream execution.
// Perform the matrix multiplication D = A*B+C, using C==D for now.
CUSPARSE_REPORT_IF_ERROR(
- cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed,
+ cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed,
matB->values, betap, matC->values,
/*dD*/ matC->values, d_workspace, nullptr, 0))
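
The wrappers above now consult a process-wide cuSPARSELt handle instead of one threaded through every call. For reference, below is a minimal sketch of the create/destroy pair the asserts expect the client to call, assuming the globals cusparseLt_env and cusparseLt_initiated referenced above; the export macro and any stream parameters of the real wrappers are omitted, so treat this as an illustration rather than the verbatim implementation.

#include <cassert>
#include <cusparseLt.h>

// One cuSPARSELt library context per instance lifetime, shared by all
// wrappers above.
static cusparseLtHandle_t cusparseLt_env;
static bool cusparseLt_initiated = false;

extern "C" void mgpuCreateSparseLtEnv() {
  assert(!cusparseLt_initiated && "environment already created");
  // cusparseLtInit allocates the context that is reused for the whole run.
  cusparseStatus_t status = cusparseLtInit(&cusparseLt_env);
  assert(status == CUSPARSE_STATUS_SUCCESS && "cusparseLtInit failed");
  cusparseLt_initiated = true;
}

extern "C" void mgpuDestroySparseLtEnv() {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  cusparseLtDestroy(&cusparseLt_env);
  cusparseLt_initiated = false;
}
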
diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
index d46baa7c4ef664..45677f29d3bf79 100644
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -6,29 +6,25 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuStreamCreate
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuMemAlloc
- // CHECK: llvm.call @mgpuCreateSparseLtEnv
// CHECK: llvm.call @mgpuCusparseLtCreate2To4SpMat
// CHECK: llvm.call @mgpuCreateCuSparseLtDnMat
// CHECK: llvm.call @mgpuCuSparseLtSpMMBufferSize
// CHECK: llvm.call @mgpuCuSparseLtSpMM
// CHECK: llvm.call @mgpuDestroyCuSparseLtSpMat
// CHECK: llvm.call @mgpuDestroyCuSparseLtDnMat
- // CHECK: llvm.call @mgpuDestroySparseLtEnv
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
func.func @matmul(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xf16>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref<?xf16>
- %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf16>
- %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16
- %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
+ %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1: memref<?xf16>
+ %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf16>
+ %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16
+ %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
index 6b7d2b9b87fe72..fff3e5954d577d 100644
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -6,29 +6,25 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuStreamCreate
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuMemAlloc
- // CHECK: llvm.call @mgpuCreateSparseEnv
// CHECK: llvm.call @mgpuCreateCoo
// CHECK: llvm.call @mgpuCreateDnVec
// CHECK: llvm.call @mgpuSpMVBufferSize
// CHECK: llvm.call @mgpuSpMV
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuDestroyDnVec
- // CHECK: llvm.call @mgpuDestroySparseEnv
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
func.func @matvec(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
- %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
- %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref<?xf64>
+ %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64
+ %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
@@ -36,29 +32,25 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuStreamCreate
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuMemAlloc
- // CHECK: llvm.call @mgpuCreateSparseEnv
// CHECK: llvm.call @mgpuCreateCsr
// CHECK: llvm.call @mgpuCreateDnMat
// CHECK: llvm.call @mgpuSpMMBufferSize
// CHECK: llvm.call @mgpuSpMM
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuDestroyDnMat
- // CHECK: llvm.call @mgpuDestroySparseEnv
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
func.func @matmul(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
- %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
- %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+ %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+ %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
@@ -66,29 +58,25 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuStreamCreate
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuMemAlloc
- // CHECK: llvm.call @mgpuCreateSparseEnv
// CHECK: llvm.call @mgpuCreateCsr
// CHECK: llvm.call @mgpuCreateDnMat
// CHECK: llvm.call @mgpuSDDMMBufferSize
// CHECK: llvm.call @mgpuSDDMM
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuDestroyDnMat
- // CHECK: llvm.call @mgpuDestroySparseEnv
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
func.func @sddmm(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
- %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
- %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+ %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+ %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
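
The plain cuSPARSE path follows the same pattern: the lowering above no longer emits @mgpuCreateSparseEnv/@mgpuDestroySparseEnv around each kernel, since the environment is created once and reused for the lifetime of the instance. A hedged sketch of that wrapper pair is shown below; the global name is illustrative, and the real definitions in CudaRuntimeWrappers.cpp may differ in detail.

#include <cassert>
#include <cusparse.h>

// Illustrative global: one cuSPARSE handle reused across all runtime calls.
static cusparseHandle_t cusparse_env = nullptr;

extern "C" void mgpuCreateSparseEnv() {
  if (cusparse_env != nullptr)
    return; // already created; keep reusing it
  cusparseStatus_t status = cusparseCreate(&cusparse_env);
  assert(status == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
}

extern "C" void mgpuDestroySparseEnv() {
  assert(cusparse_env != nullptr && "client did not call mgpuCreateSparseEnv()");
  cusparseDestroy(cusparse_env);
  cusparse_env = nullptr;
}
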
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 3f5dbb15660c25..0c78e5a2d665d9 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -326,38 +326,34 @@ module attributes {gpu.container_module} {
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
// CHECK: gpu.alloc async
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- // CHECK: gpu.create_sparse_env async
- %env, %token3 = gpu.create_sparse_env async [%token2]
// CHECK: gpu.create_coo async
- %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: gpu.create_csr async
%spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: gpu.create_dn_tensor async
- %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref<?xf64>
+ %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref<?xf64>
// CHECK: gpu.spmv_buffer_size async
- %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64
+ %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64
// CHECK: gpu.spmv async
- %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
+ %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
// CHECK: gpu.create_dn_tensor async
- %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+ %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
// CHECK: gpu.spmm_buffer_size async
- %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64
+ %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64
// CHECK: gpu.spmm async
- %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
+ %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
// CHECK: gpu.sddmm_buffer_size async
- %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64
+ %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64
// CHECK: gpu.sddmm async
- %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
+ %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
// CHECK: gpu.destroy_dn_tensor async
%token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
// CHECK: gpu.destroy_sp_mat async
%token15 = gpu.destroy_sp_mat async [%token14] %spmat
// CHECK: gpu.destroy_dn_tensor async
%token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
- // CHECK: gpu.destroy_sparse_env async
- %token17 = gpu.destroy_sparse_env async [%token16] %env
// CHECK: gpu.wait
- gpu.wait [%token17]
+ gpu.wait [%token16]
return
}
}
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
index 6766c982df789f..2d07f8ceaf7274 100644
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -6,29 +6,25 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.wait async
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
// CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
- // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+ // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}} : index into memref<?xf64>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+ // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
// CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
// CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
- // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
// CHECK: gpu.wait [%{{.*}}]
// CHECK: return
func.func @matvec(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
- %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
- %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref<?xf64>
+ %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64
+ %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
@@ -36,29 +32,25 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.wait async
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
// CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
- // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+ // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+ // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
// CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
// CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
- // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
// CHECK: gpu.wait [%{{.*}}]
// CHECK: return
func.func @matmul(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
- %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
- %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+ %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+ %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
@@ -66,29 +58,25 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.wait async
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
// CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
- // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
- // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+ // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
+ // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+ // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
// CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
// CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
- // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
// CHECK: gpu.wait [%{{.*}}]
// CHECK: return
func.func @sddmm(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
- %env, %token3 = gpu.create_sparse_env async [%token2]
- %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
- %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
- %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
- %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
+ %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+ %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+ %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+ %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
%token8 = gpu.destroy_sp_mat async [%token7] %spmat
%token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
- %token10 = gpu.destroy_sparse_env async [%token9] %env
- gpu.wait [%token10]
+ gpu.wait [%token9]
return
}
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
index c8b7e4835f86f1..2807008f98b790 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -45,19 +45,16 @@
// CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>
// CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
// CHECK: %[[VAL_41:.*]] = gpu.wait async
-// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_41]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
// CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
-// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
-// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_19]] : memref<?xindex>
// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
index 4d267fb68c79b6..560a535f120342 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -43,18 +43,16 @@ module {
// CHECK: %[[VAL_37:.*]] = gpu.memcpy async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_32]] : memref<?xf64>, memref<?xf64>
// CHECK: gpu.wait {{\[}}%[[VAL_15]], %[[VAL_20]], %[[VAL_25]], %[[VAL_31]], %[[VAL_37]]]
// CHECK: %[[VAL_38:.*]] = gpu.wait async
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
-// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_38]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
// CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
// CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
-// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
-// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
+// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_13]] : memref<?xindex>
// CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>
// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref<?xf64>
// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref<?xi8>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
index 2cd9e2847a6235..71641f33f82bd2 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -53,18 +53,16 @@
// CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_18]] : memref<?xf64>, memref<?xf64>
// CHECK: gpu.wait {{\[}}%[[VAL_10]], %[[VAL_15]], %[[VAL_23]], %[[VAL_28]], %[[VAL_33]]]
// CHECK: %[[VAL_34:.*]] = gpu.wait async
-// CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]]
-// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_35]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_34]]] %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_csr async {{\[}}%[[VAL_40]]] %[[VAL_3]], %[[VAL_3]], %[[VAL_5]], %[[VAL_21]], %[[VAL_26]], %[[VAL_31]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.alloc async {{\[}}%[[VAL_44]]] (%[[VAL_43]]) : memref<?xi8>
-// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref<?xi8> into f64
+// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref<?xi8> into f64
// CHECK: %[[VAL_48:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_37]]
// CHECK: %[[VAL_49:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_48]]] %[[VAL_39]]
// CHECK: %[[VAL_50:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_49]]] %[[VAL_41]]
-// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] %[[VAL_35]]
-// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_51]]] %[[VAL_45]] : memref<?xi8>
+// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_50]]] %[[VAL_45]] : memref<?xi8>
// CHECK: %[[VAL_53:.*]] = gpu.dealloc async {{\[}}%[[VAL_52]]] %[[VAL_8]] : memref<8x8xf64>
// CHECK: %[[VAL_54:.*]] = gpu.dealloc async {{\[}}%[[VAL_53]]] %[[VAL_13]] : memref<8x8xf64>
// CHECK: %[[VAL_55:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_21]] : memref<?xindex>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 0ce978c4d7cec8..f1e985e70793bc 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -11,6 +11,9 @@
// RUN: | FileCheck %s
module {
+ llvm.func @mgpuCreateSparseLtEnv()
+ llvm.func @mgpuDestroySparseLtEnv()
+
func.func @sampled_matmul(%a : memref<16x32xf16>,
%b : memref<32x16xf16>,
%c : memref<16x16xf16>) {
@@ -28,19 +31,17 @@ module {
%token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
%token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
%token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
- %env, %token7 = gpu.create_sparse_env async [%token6]
- %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
- %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
- %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
- %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+ %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+ %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
+ %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
+ %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
%mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
%mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
%mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
- %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
+ %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
%token16 = gpu.destroy_sp_mat async [%token15] %spmat
%token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
- %token18 = gpu.destroy_sparse_env async [%token17] %env
- %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+ %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
%token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
%token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
%token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
@@ -57,6 +58,7 @@ module {
// using NVidia 2:4 structured sparsity for A.
//
func.func @main() {
+ llvm.call @mgpuCreateSparseLtEnv() : () -> ()
%f0 = arith.constant 0.0 : f16
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
@@ -225,7 +227,8 @@ module {
%pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
vector.print %pc0 : vector<16xf16>
}
-
+
+ llvm.call @mgpuDestroySparseLtEnv() : () -> ()
return
}
}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index d7eade81a01f39..2b471e0e118c4f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -32,6 +32,9 @@
}>
module {
+ llvm.func @mgpuCreateSparseEnv()
+ llvm.func @mgpuDestroySparseEnv()
+
// Computes C = A x B with A sparse COO.
func.func @matmulCOO(%A: tensor<8x8xf32, #SortedCOO>,
%B: tensor<8x8xf32>,
@@ -85,6 +88,7 @@ module {
// Main driver.
//
func.func @main() {
+ llvm.call @mgpuCreateSparseEnv(): () -> ()
%f0 = arith.constant 0.0 : f32
%f1 = arith.constant 1.0 : f32
@@ -173,6 +177,8 @@ module {
bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
+ llvm.call @mgpuDestroySparseEnv(): () -> ()
+
return
}
}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index f3f5820d3f2069..9c2ddcc9282935 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -32,6 +32,9 @@
}>
module {
+ llvm.func @mgpuCreateSparseEnv()
+ llvm.func @mgpuDestroySparseEnv()
+
// Compute matrix vector y = Ax on COO with default index coordinates.
func.func @matvecCOO(%A: tensor<?x?xf64, #SortedCOO>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
%y_out = linalg.matvec
@@ -49,6 +52,7 @@ module {
}
func.func @main() {
+ llvm.call @mgpuCreateSparseEnv() : () -> ()
%f0 = arith.constant 0.0 : f64
%f1 = arith.constant 1.0 : f64
%c0 = arith.constant 0 : index
@@ -122,6 +126,8 @@ module {
// Release the resources.
bufferization.dealloc_tensor %Acoo : tensor<?x?xf64, #SortedCOO>
bufferization.dealloc_tensor %Acsr : tensor<?x?xf64, #CSR>
+
+ llvm.call @mgpuDestroySparseEnv() : () -> ()
return
}
}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index b8d30d00058361..e4a3294f971748 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -46,6 +46,9 @@
// runs the resulting code with the JIT compiler.
//
module {
+ llvm.func @mgpuCreateSparseEnv()
+ llvm.func @mgpuDestroySparseEnv()
+
//
// A kernel that computes a sampled dense matrix matrix multiplication
// using a "spy" function and in-place update of the sampling sparse matrix.
@@ -81,6 +84,7 @@ module {
// Main driver.
//
func.func @entry() {
+ llvm.call @mgpuCreateSparseEnv() : () -> ()
%d0 = arith.constant 0.0 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
@@ -149,6 +153,7 @@ module {
bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
bufferization.dealloc_tensor %1 : tensor<?x?xf32, #CSR>
+ llvm.call @mgpuDestroySparseEnv() : () -> ()
return
}
}