[Mlir-commits] [mlir] 95a6c50 - [mlir][sparse][gpu] add set csr pointers, remove estimate op, fix bugs

Aart Bik llvmlistbot at llvm.org
Thu Aug 10 13:52:58 PDT 2023


Author: Aart Bik
Date: 2023-08-10T13:52:47-07:00
New Revision: 95a6c509c9ec9b87ca951de4cded9a7807058ae6

URL: https://github.com/llvm/llvm-project/commit/95a6c509c9ec9b87ca951de4cded9a7807058ae6
DIFF: https://github.com/llvm/llvm-project/commit/95a6c509c9ec9b87ca951de4cded9a7807058ae6.diff

LOG: [mlir][sparse][gpu] add set csr pointers, remove estimate op, fix bugs

Rationale:
Since we only support the default algorithm for SpGEMM, we can remove the
estimate op (for now, at least). This revision also introduces the set csr
pointers op and fixes a few bugs in the existing lowering of the SpGEMM
breakdown, paving the way for actual recognition of SpGEMM in the sparsifier.

Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D157645
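
A condensed sketch of the resulting SpGEMM breakdown, based on the updated
roundtrip test below; the %dep, %c0, %alloc, and buffer values are
illustrative, with a zero-sized buffer standing in for a null pointer:

```mlir
// Hedged sketch: default-algorithm SpGEMM using the new set_csr_pointers op.
%desc, %t0 = gpu.spgemm_create_descr async [%dep]
%sz1, %t1 = gpu.spgemm_work_estimation_or_compute async [%t0]{WORK_ESTIMATION}
              %spmatA, %spmatB, %spmatC, %desc, %c0, %alloc : f32 into memref<0xi8>
%sz2, %t2 = gpu.spgemm_work_estimation_or_compute async [%t1]{COMPUTE}
              %spmatA, %spmatB, %spmatC, %desc, %c0, %alloc : f32 into memref<0xi8>
%rows, %cols, %nnz, %t3 = gpu.spgemm_get_size async [%t2] %spmatC
%t4 = gpu.set_csr_pointers async [%t3] %spmatC, %pos, %crd, %val
        : memref<?xindex>, memref<?xindex>, memref<?xf32>
%t5 = gpu.spgemm_copy async [%t4] %spmatA, %spmatB, %spmatC, %desc : f32
%t6 = gpu.spgemm_destroy_descr async [%t5] %desc
```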

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
    mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
    mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
    mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
    mlir/test/Dialect/GPU/sparse-roundtrip.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 8dc88663c0c8c2..498e8c37049a8d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1561,6 +1561,7 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
 // Operation on sparse matrices, called from the host
 // (currently lowers to cuSparse for CUDA only, no ROCM lowering).
 //
+
 def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
   let summary = "Create dense tensor operation";
   let description = [{
@@ -2234,9 +2235,9 @@ def GPU_SpGEMMWorkEstimationOrComputeOp : GPU_Op<"spgemm_work_estimation_or_comp
     Example:
 
     ```mlir
-    %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep]{COMPUTE}
+    %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep] {COMPUTE}
                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
-                          %spmatC, ALG2, %spgemmDesc, %c0, %alloc: f32 into
+                          %spmatC, %spgemmDesc, %c0, %alloc: f32 into
                           memref<0xi8>
     ```
 
@@ -2283,69 +2284,11 @@ def GPU_SpGEMMWorkEstimationOrComputeOp : GPU_Op<"spgemm_work_estimation_or_comp
   }];
 }
 
-def GPU_SpGEMMEstimateMemoryOp : GPU_Op<"spgemm_estimate_memory", [GPU_AsyncOpInterface]> {
-  let summary = "SpGEMM estimate memory operation";
-  let description = [{
-    The `gpu.spgemm_estimate_memory` is used for both determining the buffer
-    size and performing the actual computation.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a `!gpu.async.token` in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %bufferSz3, %dummy, %token = gpu.spgemm_estimate_memory async [%dep] %spmatA, %spmatB, %spmatC, ALG2, %spgemmDesc, %c0, %c0, %alloc: f32 into memref<0xi8>
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                       GPU_SparseSpGEMMOpHandle:$desc,
-                       GPU_TransposeModeAttr:$modeA,
-                       GPU_TransposeModeAttr:$modeB,
-                       GPU_SparseSpMatHandle:$spmatA,
-                       GPU_SparseSpMatHandle:$spmatB,
-                       GPU_SparseSpMatHandle:$spmatC,
-                       TypeAttr:$computeType,
-                       Index:$bufferSz3,
-                       AnyMemRef:$buffer3,
-                       Index:$bufferSz2);
-  let results = (outs Index:$bufferSz3New,
-                      Index:$bufferSz2New,
-                      Optional<GPU_AsyncToken>:$asyncToken);
-
-  let builders = [OpBuilder<(ins
-    "Type":$bufferSz3New,
-    "Type":$bufferSz2New,
-    "Type":$asyncToken,
-    "ValueRange":$asyncDependencies,
-    "Value":$desc,
-    "Value":$spmatA,
-    "Value":$spmatB,
-    "Value":$spmatC,
-    "Type":$computeType,
-    "Value":$bufferSz3,
-    "Value":$buffer3,
-    "Value":$bufferSz2), [{
-  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
-  auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-  return build($_builder, $_state, bufferSz3New, bufferSz2New, asyncToken,
-               asyncDependencies, desc, modeA, modeB, spmatA, spmatB, spmatC,
-               computeType, bufferSz3, buffer3, bufferSz2);}]>
-  ];
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc `,` $bufferSz3 `,` $bufferSz2 `,` $buffer3 attr-dict `:` $computeType `into` type($buffer3)
-  }];
-}
-
 def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> {
   let summary = "SpGEMM copy operation";
   let description = [{
-    The `gpu.spgemm_copy` operation copies a sparse matrix, e.g., the result of
-    the SpGEMM computation.
+    The `gpu.spgemm_copy` operation copies the sparse matrix result of
+    a SpGEMM computation.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In
@@ -2354,7 +2297,7 @@ def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> {
     Example:
 
     ```mlir
-    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, ALG2, %spgemmDesc: f32
+    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
     ```
 
     The matrix arguments can also be associated with one of the following
@@ -2422,4 +2365,37 @@ def GPU_SpGEMMGetSizeOp : GPU_Op<"spgemm_get_size", [GPU_AsyncOpInterface]> {
   }];
 }
 
+def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM get size operation";
+  let description = [{
+    The `gpu.set_csr_pointers` operation assigns the given position, coordinate,
+    and value buffers, which reside on the device, directly to the given sparse
+    matrix descriptor in CSR format.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a `!gpu.async.token`.
+
+    Example:
+
+    ```mlir
+    %token = gpu.set_csr_pointers async [%dep] %spmat, %positions, %coordinates,
+          %values : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseSpMatHandle>:$spmat,
+                       AnyMemRef:$positions,
+                       AnyMemRef:$coordinates,
+                       AnyMemRef:$values);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+      $spmat `,` $positions `,` $coordinates `,` $values attr-dict
+        `:` type($positions) `,` type($coordinates) `,` type($values)
+  }];
+}
+
 #endif // GPU_OPS

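To make the new op concrete, a small usage sketch under the assumption that
the output buffers are sized from gpu.spgemm_get_size (names are illustrative;
a real lowering would size the positions buffer as rows+1):

```mlir
// Hedged sketch: allocate device buffers for the result and hand them to the
// C matrix descriptor via the new op.
%rows, %cols, %nnz, %t0 = gpu.spgemm_get_size async [%dep] %spmatC
%pos, %t1 = gpu.alloc async [%t0] (%rows) : memref<?xindex>
%crd, %t2 = gpu.alloc async [%t1] (%nnz) : memref<?xindex>
%val, %t3 = gpu.alloc async [%t2] (%nnz) : memref<?xf32>
%t4 = gpu.set_csr_pointers async [%t3] %spmatC, %pos, %crd, %val
        : memref<?xindex>, memref<?xindex>, memref<?xf32>
```
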
diff  --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 37efadd1be5625..88dd32a2146eb9 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -302,15 +302,6 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
        llvmInt32Type /*ctp*/, llvmIntPtrType /*bs*/, llvmPointerType /*buf*/,
        llvmPointerType /*void *stream*/}};
-  FunctionCallBuilder createSpGEMMEstimateMemoryBuilder = {
-      "mgpuSpGEMMEstimateMemory",
-      llvmVoidType,
-      {llvmPointerType /*nbs3*/, llvmPointerType /*nbs2*/,
-       llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
-       llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
-       llvmInt32Type /*ctp*/, llvmFloat32Type /*chunk_fraction*/,
-       llvmIntPtrType /*bs3*/, llvmPointerType /*buf3*/, llvmIntPtrType /*bs2*/,
-       llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMComputeBuilder = {
       "mgpuSpGEMMCompute",
       llvmIntPtrType,
@@ -329,7 +320,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       llvmPointerType,
       {llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMDestroyDescrBuilder = {
-      "mgpuSpGEMMDestoryDescr",
+      "mgpuSpGEMMDestroyDescr",
       llvmVoidType,
       {llvmPointerType /*s*/, llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMGetSizeBuilder = {
@@ -337,6 +328,12 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       llvmVoidType,
       {llvmPointerType /*mc*/, llvmPointerType /*rc*/, llvmPointerType /*cc*/,
        llvmPointerType /*nc*/, llvmPointerType /*void *stream*/}};
+  FunctionCallBuilder createSetCsrPointersBuilder = {
+      "mgpuSetCsrPointers",
+      llvmVoidType,
+      {llvmPointerType /*spmat*/, llvmPointerType /*pos*/,
+       llvmPointerType /*crd*/, llvmPointerType /*val*/,
+       llvmPointerType /*void *stream*/}};
 };
 
 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -559,9 +556,9 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SDDMMOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMCreateDescrOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMDestroyDescrOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMWorkEstimationOrComputeOp)
-DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMEstimateMemoryOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMCopyOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMGetSizeOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SetCsrPointersOp)
 
 } // namespace
 
@@ -1710,13 +1707,13 @@ LogicalResult
 ConvertSpGEMMDestroyDescrOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::SpGEMMDestroyDescrOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
   auto stream = adaptor.getAsyncDependencies().front();
-  createSpGEMMCopyBuilder.create(loc, rewriter, {adaptor.getDesc(), stream});
+  createSpGEMMDestroyDescrBuilder.create(loc, rewriter,
+                                         {adaptor.getDesc(), stream});
   rewriter.replaceOp(op, {stream});
   return success();
 }
@@ -1764,55 +1761,6 @@ ConvertSpGEMMWorkEstimationOrComputeOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
-LogicalResult
-ConvertSpGEMMEstimateMemoryOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::SpGEMMEstimateMemoryOp op, OpAdaptor adaptor,
-    ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, op)))
-    return failure();
-  Location loc = op.getLoc();
-  auto computeType = genConstInt32From(
-      rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
-  auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA());
-  auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB());
-  auto stream = adaptor.getAsyncDependencies().front();
-  // TODO: support other chunk fraction
-  Value chunkFraction = genConstFloat32From(rewriter, loc, 1.0);
-  Value pBuf3 =
-      MemRefDescriptor(adaptor.getBuffer3()).allocatedPtr(rewriter, loc);
-  if (!getTypeConverter()->useOpaquePointers())
-    pBuf3 = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf3);
-
-  auto two = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
-                                               rewriter.getIndexAttr(2));
-  auto bufferSize = rewriter.create<LLVM::AllocaOp>(
-      loc, llvmInt64PointerType, llvmInt64Type, two, /*alignment=*/16);
-
-  auto bufferSizePtr2 = rewriter.create<LLVM::GEPOp>(
-      loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
-      ValueRange{rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
-                                                   rewriter.getIndexAttr(0))});
-  auto bufferSizePtr3 = rewriter.create<LLVM::GEPOp>(
-      loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
-      ValueRange{rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
-                                                   rewriter.getIndexAttr(1))});
-
-  createSpGEMMEstimateMemoryBuilder.create(
-      loc, rewriter,
-      {bufferSizePtr3, bufferSizePtr2, adaptor.getDesc(), modeA, modeB,
-       adaptor.getSpmatA(), adaptor.getSpmatB(), adaptor.getSpmatC(),
-       computeType, chunkFraction, adaptor.getBufferSz3(), pBuf3,
-       adaptor.getBufferSz2(), stream});
-  auto bufferSize2 =
-      rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr2);
-  auto bufferSize3 =
-      rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr3);
-
-  rewriter.replaceOp(op, {bufferSize3, bufferSize2, stream});
-  return success();
-}
-
 LogicalResult ConvertSpGEMMCopyOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::SpGEMMCopyOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1869,6 +1817,31 @@ LogicalResult ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
+LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::SetCsrPointersOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pPos =
+      MemRefDescriptor(adaptor.getPositions()).allocatedPtr(rewriter, loc);
+  Value pCrd =
+      MemRefDescriptor(adaptor.getCoordinates()).allocatedPtr(rewriter, loc);
+  Value pVal =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pPos);
+    pCrd = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pCrd);
+    pVal = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVal);
+  }
+  createSetCsrPointersBuilder.create(
+      loc, rewriter, {adaptor.getSpmat(), pPos, pCrd, pVal, stream});
+  rewriter.replaceOp(op, {stream});
+  return success();
+}
+
 void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns,
                                                StringRef gpuBinaryAnnotation,
@@ -1904,9 +1877,9 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertSpGEMMCreateDescrOpToGpuRuntimeCallPattern,
                ConvertSpGEMMDestroyDescrOpToGpuRuntimeCallPattern,
                ConvertSpGEMMWorkEstimationOrComputeOpToGpuRuntimeCallPattern,
-               ConvertSpGEMMEstimateMemoryOpToGpuRuntimeCallPattern,
                ConvertSpGEMMCopyOpToGpuRuntimeCallPattern,
-               ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern>(converter);
+               ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern,
+               ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter);
   patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
       converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
   patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());

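The new ConvertSetCsrPointersOpToGpuRuntimeCallPattern boils down to a single
runtime call, roughly the following LLVM-dialect IR assuming opaque pointers
(operand names are illustrative):

```mlir
// Hedged sketch: the call emitted for gpu.set_csr_pointers; all operands are
// pointers (sparse matrix handle, device buffers, and the CUDA stream).
llvm.call @mgpuSetCsrPointers(%spmat, %pPos, %pCrd, %pVal, %stream)
    : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
```
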
diff  --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 23c5e40e438189..fd338a14c504ef 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -615,45 +615,12 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
   auto cTp = static_cast<cudaDataType_t>(ctp);
   ALPHABETA(cTp, alpha, beta)
   size_t newBufferSize = bs;
-
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
       cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
       CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
   return newBufferSize == 0 ? 1 : newBufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpGEMMEstimateMemory(void *nbs3, void *nbs2, void *s, int32_t ma,
-                         int32_t mb, void *a, void *b, void *c, int32_t ctp,
-                         float chunk_fraction, intptr_t bs3, void *buf3,
-                         intptr_t bs2, CUstream /*stream*/) {
-  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
-  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
-  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
-  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
-  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
-  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
-  auto cTp = static_cast<cudaDataType_t>(ctp);
-  ALPHABETA(cTp, alpha, beta)
-  size_t *newBufferSize2 = reinterpret_cast<size_t *>(nbs2);
-  size_t *newBufferSize3 = reinterpret_cast<size_t *>(nbs3);
-  *newBufferSize2 = bs2;
-  *newBufferSize3 = bs3;
-
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_estimateMemory(
-      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
-      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, chunk_fraction, newBufferSize3, buf3,
-      newBufferSize2))
-  // avoid zero-alloc
-  if (*newBufferSize2 == 0) {
-    *newBufferSize2 = 1;
-  }
-  if (*newBufferSize3 == 0) {
-    *newBufferSize3 = 1;
-  }
-  return;
-}
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
 mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
                   int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
@@ -683,7 +650,6 @@ mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
   cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
   auto cTp = static_cast<cudaDataType_t>(ctp);
   ALPHABETA(cTp, alpha, beta)
-
   CUSPARSE_REPORT_IF_ERROR(
       cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap,
                           matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
@@ -712,6 +678,12 @@ mgpuSpGEMMGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
   CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matDescr, rows, cols, nnz));
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSetCsrPointers(void *m, void *p, void *c, void *v, CUstream /*stream*/) {
+  cusparseSpMatDescr_t matDescr = reinterpret_cast<cusparseSpMatDescr_t>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseCsrSetPointers(matDescr, p, c, v));
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
 
 ///

diff  --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
index 40489295143862..45b991bd4f8896 100644
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -93,9 +93,6 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
   // CHECK: llvm.call @mgpuMemAlloc
   // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
-  // CHECK: llvm.call @mgpuSpGEMMEstimateMemory
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuSpGEMMEstimateMemory
   // CHECK: llvm.call @mgpuMemAlloc
   // CHECK: llvm.call @mgpuSpGEMMCompute
   // CHECK: llvm.call @mgpuMemAlloc
@@ -130,19 +127,10 @@ module attributes {gpu.container_module} {
                               [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
                               %spmatC, %spgemmDesc, %bufferSz1,
                               %buf1: f32 into memref<?xi8>
-    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
-                                     %spmatA, %spmatB, %spmatC,
-                                     %spgemmDesc, %c0, %c0,
-                                     %alloc: f32 into memref<0xi8>
-    %buf3, %token11 = gpu.alloc async [%token10] (%bufferSz3) : memref<?xi8>
-    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
-                                          [%token11] %spmatA, %spmatB, %spmatC,
-                                          %spgemmDesc, %bufferSz3, %c0,
-                                          %buf3: f32 into memref<?xi8>
-    %buf2, %token13 = gpu.alloc async [%token12] (%bufferSz2) : memref<?xi8>
+    %buf2, %token13 = gpu.alloc async [%token9] (%bufferSz1_1) : memref<?xi8>
     %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
                                [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
-                               %spgemmDesc, %bufferSz2,
+                               %spgemmDesc, %bufferSz1_1,
                                %buf2: f32 into memref<?xi8>
     %rows, %cols, %nnz, %token15 = gpu.spgemm_get_size async [%token14] %spmatC
     %mem_columns, %token16 = gpu.alloc async [%token15] (%cols) : memref<?xi32>

diff  --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
index b39bc101af17f9..171a1ad24898ff 100644
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -55,72 +55,45 @@ module attributes {gpu.container_module} {
   }
 
   // CHECK-LABEL:     func @spgemm
-  // CHECK:      %{{.*}} = gpu.wait async
+  // CHECK:           %{{.*}} = gpu.wait async
   // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf32>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf32>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf32>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf32>
   // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_create_descr async [%{{.*}}]
-  // CHECK:           %{{.*}} = memref.alloc() : memref<0xi8>
-  // CHECK:           %{{.*}} = arith.constant 0 : index
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
-  // CHECK:           %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-  // CHECK:           %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ WORK_ESTIMATION} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
+  // CHECK:           %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ COMPUTE} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
   // CHECK:           %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_get_size async [%{{.*}}] %{{.*}}
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi32>
-  // CHECK:           %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf32>
-  // CHECK:           gpu.wait [%{{.*}}]
-  // CHECK:           gpu.spgemm_copy  %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
-  // CHECK:           gpu.destroy_sp_mat  %{{.*}}
-  // CHECK:           gpu.destroy_sp_mat  %{{.*}}
-  // CHECK:           gpu.destroy_sp_mat  %{{.*}}
+  // CHECK:           %{{.*}} = gpu.set_csr_pointers async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf32>
+  // CHECK:           %{{.*}} = gpu.spgemm_copy async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+  // CHECK:           %{{.*}} = gpu.spgemm_destroy_descr async [%{{.*}}] %{{.*}}
+  // CHECK:           gpu.destroy_sp_mat %{{.*}}
+  // CHECK:           gpu.destroy_sp_mat %{{.*}}
+  // CHECK:           gpu.destroy_sp_mat %{{.*}}
   // CHECK:           return
   func.func @spgemm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
-    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
-    %spmatA, %token3 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %spmatB, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %spmatC, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf32>
+    %spmatA, %token3 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    %spmatB, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    %spmatC, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
     %spgemmDesc, %token6 = gpu.spgemm_create_descr async [%token5]
-    // Used as nullptr
-    %alloc = memref.alloc() : memref<0xi8>
+    %alloc = memref.alloc() : memref<0xi8>  // nullptr
     %c0 = arith.constant 0 : index
     %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
                             [%token6]{WORK_ESTIMATION}
-                            %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
-                            %spmatC, %spgemmDesc, %c0,
-                            %alloc: f32 into memref<0xi8>
-    %buf1, %token8 = gpu.alloc async [%token7] (%bufferSz1) : memref<?xi8>
-    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
-                              [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
-                              %spmatC, %spgemmDesc, %bufferSz1,
-                              %buf1: f32 into memref<?xi8>
-    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
-                                     %spmatA, %spmatB, %spmatC,
-                                     %spgemmDesc, %c0, %c0,
-                                     %alloc: f32 into memref<0xi8>
-    %buf3, %token11 = gpu.alloc async [%token10] (%bufferSz3) : memref<?xi8>
-    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
-                                          [%token11] %spmatA, %spmatB, %spmatC,
-                                          %spgemmDesc, %bufferSz3, %c0,
-                                          %buf3: f32 into memref<?xi8>
-    %buf2, %token13 = gpu.alloc async [%token12] (%bufferSz2) : memref<?xi8>
-    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
-                               [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
-                               %spgemmDesc, %bufferSz2,
-                               %buf2: f32 into memref<?xi8>
-    %rows, %cols, %nnz, %token15 = gpu.spgemm_get_size async [%token14] %spmatC
-    %mem_columns, %token16 = gpu.alloc async [%token15] (%cols) : memref<?xi32>
-    %mem_values, %token17 = gpu.alloc async [%token16] (%nnz) : memref<?xf32>
-    gpu.wait [%token17]
-    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
+                            %spmatA, %spmatB, %spmatC,
+                            %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
+    %bufferSz2, %token8 = gpu.spgemm_work_estimation_or_compute async
+                               [%token7]{COMPUTE}
+                               %spmatA, %spmatB, %spmatC,
+                               %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
+    %rows, %cols, %nnz, %token9 = gpu.spgemm_get_size async [%token8] %spmatC
+    %token10 = gpu.set_csr_pointers async [%token8] %spmatC, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    %token11 = gpu.spgemm_copy async [%token10] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
+    %token12 = gpu.spgemm_destroy_descr async [%token11] %spgemmDesc
     gpu.destroy_sp_mat %spmatA
     gpu.destroy_sp_mat %spmatB
     gpu.destroy_sp_mat %spmatC


        

