[Mlir-commits] [mlir] 9fc02a7 - [mlir][sparse][gpu] add AoS COO support to cuSPARSE

Tue Jun 6 12:32:55 PDT 2023

Author: Aart Bik
Date: 2023-06-06T12:32:46-07:00
New Revision: 9fc02a7a0861921d9d655e783dcbdd2472bc43ea

URL: https://github.com/llvm/llvm-project/commit/9fc02a7a0861921d9d655e783dcbdd2472bc43ea
DIFF: https://github.com/llvm/llvm-project/commit/9fc02a7a0861921d9d655e783dcbdd2472bc43ea.diff

LOG: [mlir][sparse][gpu] add AoS COO support to cuSPARSE

Even though cuSPARSE deprecated the AoS COO format in release 11.2,
any library before this version still supports it, which is why
we are making it available under the CUSPARSE_COO_AOS macro.

Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D152290

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
    mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
    mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
    mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 17bff31941579..70a41e0d975a9 100644

--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1560,7 +1560,7 @@ def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]>
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
-  let results = (outs Res<GPU_SparseEnvHandle>:$env, 
+  let results = (outs Res<GPU_SparseEnvHandle>:$env,
                       Optional<GPU_AsyncToken>:$asyncToken);
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
@@ -1568,7 +1568,7 @@ def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]>
 }
 
 def GPU_DestroySparseEnvOp : GPU_Op<
-    "destroy_sparse_env", 
+    "destroy_sparse_env",
     [GPU_AsyncOpInterface]> {
   let summary = "Destroy sparse environment operation";
   let description = [{
@@ -1592,7 +1592,7 @@ def GPU_DestroySparseEnvOp : GPU_Op<
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
     $env attr-dict
   }];
 }
@@ -1618,7 +1618,7 @@ def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    AnyMemRef:$memref, Index:$size);
-  let results = (outs Res<GPU_SparseDnVecHandle>:$dvec, 
+  let results = (outs Res<GPU_SparseDnVecHandle>:$dvec,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
@@ -1650,7 +1650,7 @@ def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
     $dvec attr-dict
   }];
 }
@@ -1709,7 +1709,7 @@ def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
     $dmat attr-dict
   }];
 }
@@ -1721,6 +1721,7 @@ def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
     with the given sizes from the given index and values buffers. The buffers
     must already be copied from the host to the device prior to using this
     operation. The operation returns a handle to the sparse matrix descriptor.
+    Note that this operation builds the COO in SoA format.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In
@@ -1741,7 +1742,7 @@ def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
                        AnyMemRef:$rowIdxs,
                        AnyMemRef:$colIdxs,
                        AnyMemRef:$values);
-  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
@@ -1751,6 +1752,45 @@ def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
   }];
 }
 
+def GPU_CreateCooAoSOp : GPU_Op<"create_coo_aos", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in COO format operation (AoS)";
+  let description = [{
+    The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format
+    with the given sizes from the given index and values buffers. The buffers
+    must already be copied from the host to the device prior to using this
+    operation. The operation returns a handle to the sparse matrix descriptor.
+    Unlike the default `gpu.create_coo` operation, this operation builds the
+    COO format from a single index buffer in AoS format (note that this
+    feature has been deprecated in cuSPARSE 11.2).
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the matrix handle.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs,
+        %values : memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Index:$rows,
+                   Index:$cols,
+                   Index:$nnz,
+                   AnyMemRef:$idxs,
+                   AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $nnz `,` $idxs `,` $values attr-dict
+    `:` type($idxs) `,` type($values)
+  }];
+}
+
 def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix in CSR format operation";
   let description = [{
@@ -1779,7 +1819,7 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
                    AnyMemRef:$rowPos,
                    AnyMemRef:$colIdxs,
                    AnyMemRef:$values);
-  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, 
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
@@ -1816,8 +1856,8 @@ def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
   }];
 }
 
-// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic 
-// literals in this enum. Note that this should be kept in sync with 
+// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic
+// literals in this enum. Note that this should be kept in sync with
 // cusparseOperation_t in cusparse.h:
 // typedef enum {
 // CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,
@@ -1828,8 +1868,8 @@ def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
 def GPU_TransposeMode : I32EnumAttr<"TransposeMode",
     "transpose mode of sparse matrix supported by sparse tensor ops",
     [
-      I32EnumAttrCase<"NON_TRANSPOSE", 0>, 
-      I32EnumAttrCase<"TRANSPOSE", 1>, 
+      I32EnumAttrCase<"NON_TRANSPOSE", 0>,
+      I32EnumAttrCase<"TRANSPOSE", 1>,
       I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>,
     ]> {
       let genSpecializedAttr = 0;
@@ -1853,7 +1893,7 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
 
@@ -1870,7 +1910,7 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
                        GPU_SparseDnVecHandle:$dnX,
                        GPU_SparseDnVecHandle:$dnY,
                        TypeAttr:$computeType);
-  let results = (outs Res<Index>:$bufferSz, 
+  let results = (outs Res<Index>:$bufferSz,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
@@ -1884,7 +1924,7 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
       "Type":$computeType)
       , [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, 
+    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
                  env, modeA, spmatA, dnX, dnY, computeType);}]>
   ];
 
@@ -1906,7 +1946,7 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
 
@@ -1958,7 +1998,7 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
 
@@ -1977,7 +2017,7 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC,
                        TypeAttr:$computeType);
-  let results = (outs Res<Index>:$bufferSz, 
+  let results = (outs Res<Index>:$bufferSz,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
@@ -1991,7 +2031,7 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
       "Type":$computeType), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, 
+    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
                  env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]>
   ];
 
@@ -2013,7 +2053,7 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
 
@@ -2046,7 +2086,7 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
       "Value":$buffer), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, 
+    return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
                  modeB, spmatA, dnmatB, dnmatC, computeType, buffer);}]>
   ];
 
@@ -2074,7 +2114,7 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
     %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
     ```
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
   }];
@@ -2100,7 +2140,7 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
       "Type":$computeType), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, 
+    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
                  env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
   ];
 
@@ -2128,7 +2168,7 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
     %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32
     ```
 
-    The matrix arguments can also be associated with one of the following 
+    The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
   }];
@@ -2155,7 +2195,7 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
     "Value":$buffer), [{
   auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
   auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-  return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, 
+  return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
                 modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]>
   ];
 

diff  --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 023a52eeec138..835cde3254278 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -224,6 +224,12 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createCooAoSCallBuilder = {
+      "mgpuCreateCooAoS", // deprecated in cuSPARSE 11.2
+      llvmPointerType, // result; args: rows, cols, nnz, idxs, values, itp, dtp
+      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+       llvmPointerType, llvmInt32Type, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder createCsrCallBuilder = {
       "mgpuCreateCsr",
       llvmPointerType,
@@ -547,6 +553,18 @@ class ConvertCreateCooOpToGpuRuntimeCallPattern
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreateCooAoSOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooAoSOp> {
+public: // the constructor only forwards the type converter to the base
+  ConvertCreateCooAoSOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooAoSOp>(typeConverter) {}
+
+private: // rewrite hook for gpu::CreateCooAoSOp; the body is out of line
+  LogicalResult
+  matchAndRewrite(gpu::CreateCooAoSOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 class ConvertCreateCsrOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp> {
 public:
@@ -1421,6 +1439,37 @@ LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
+LogicalResult ConvertCreateCooAoSOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateCooAoSOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  // Lowers the op to a runtime call; the sole dependency acts as the stream.
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
+    return failure();
+  if (failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  Value stream = adaptor.getAsyncDependencies().front();
+  Value pIdx = MemRefDescriptor(adaptor.getIdxs()).allocatedPtr(rewriter, loc);
+  Value pVal =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pIdx = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pIdx);
+    pVal = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVal);
+  }
+  auto idxsTp = llvm::cast<MemRefType>(op.getIdxs().getType());
+  auto valsTp = llvm::cast<MemRefType>(op.getValues().getType());
+  auto itp = genConstInt32From(
+      rewriter, loc, getCuSparseIndexTypeFrom(idxsTp.getElementType()));
+  auto dtp = genConstInt32From(
+      rewriter, loc, getCuSparseDataTypeFrom(valsTp.getElementType()));
+  auto call = createCooAoSCallBuilder.create(
+      loc, rewriter,
+      {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), pIdx, pVal, itp,
+       dtp, stream});
+  rewriter.replaceOp(op, {call.getResult(), stream});
+  return success();
+}
+
 LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateCsrOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1645,6 +1694,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertCreateDnMatOpToGpuRuntimeCallPattern,
                ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
+               ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
                ConvertDestroySpMatOpToGpuRuntimeCallPattern,
                ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,

diff  --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index a190ff6dacb92..ea17cf94ce6f7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -355,7 +355,11 @@ static bool areAdmissibleTypes(SparseTensorType aTp, SparseTensorType bTp,
     return false;
   if (isAdmissibleCOO(aTp)) {
     isCOO = true;
-    return enableRT; // TODO: CreateCooAoSOp was deprecated, find another way
+#ifdef CUSPARSE_COO_AOS
+    return true;
+#else
+    return enableRT;
+#endif
   }
   return isAdmissibleCSR(aTp);
 }
@@ -393,7 +397,13 @@ static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
       return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
                                               sz1, sz2, nseA, rowA, colA, valA);
     }
+#ifdef CUSPARSE_COO_AOS
+    assert(!colA);
+    return builder.create<gpu::CreateCooAoSOp>(loc, handleTp, tokenTp, token,
+                                               sz1, sz2, nseA, rowA, valA);
+#else
     llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
+#endif
   }
   assert(colA);
   return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,

diff  --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1c9497b19eea3..76ce8edfdb9e8 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -312,6 +312,19 @@ mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
   return reinterpret_cast<void *>(mat);
 }
 
+#ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs,
+                 void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) {
+  cusparseSpMatDescr_t mat = nullptr; // out-param of cusparseCreateCooAoS
+  auto iTp = static_cast<cusparseIndexType_t>(itp); // index type code
+  auto dTp = static_cast<cudaDataType_t>(dtp);      // value type code
+  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS(
+      &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp))
+  return reinterpret_cast<void *>(mat); // descriptor as an opaque pointer
+}
+#endif // CUSPARSE_COO_AOS
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
 mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
               void *colIdxs, void *values, int32_t ptp, int32_t itp,