[Mlir-commits] [mlir] [mlir][sparse][gpu] re-enable all GPU libgen tests (PR #72185)

Mon Nov 13 17:17:27 PST 2023

llvmbot wrote:



@llvm/pr-subscribers-mlir-sparse

@llvm/pr-subscribers-mlir-gpu

Author: Aart Bik (aartbik)

<details>
<summary>Changes</summary>

The previous change no longer properly used the GPU libgen pass (even though most tests still passed by falling back to CPU). This revision puts the proper pass order into place. It also includes a bit of cleanup of the CPU codegen vs. libgen setup.

---

Patch is 21.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72185.diff


15 Files Affected:

- (modified) mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h (+1-2) 
- (modified) mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h (+6-9) 
- (modified) mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td (+6-6) 
- (modified) mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp (+10-1) 
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp (+13-10) 
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp (+10-5) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir (+1-2) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir (+1-2) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir (+1-2) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir (+1-1) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir (+1-1) 
- (modified) mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir (+1-2) 
- (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir () 
- (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir () 
- (modified) mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir (+33-35) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 718922341dac310..37bb9e7986cd8a8 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,8 +144,7 @@ struct SparseCompilerOptions
 
   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableGPULibgen,
-                                 enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, enableRuntimeLibrary);
   }
 
   /// Projects out the options for `createConvertVectorToLLVMPass`.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index b124364f8cb1f05..e93e2aefb344fd5 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();
 
 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
-                        bool enableRT)
-      : parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
-        enableRuntimeLibrary(enableRT) {}
+  SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
+      : parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
-                              true) {}
+      : SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
-  bool enableGPULibgen;
   bool enableRuntimeLibrary;
 };
 
@@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
                                      bool enableRT);
 
 std::unique_ptr<Pass> createSparseGPUCodegenPass();
-std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
+std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
+                                                 bool enableRT);
 
 //===----------------------------------------------------------------------===//
 // The SparseStorageSpecifierToLLVM pass.
@@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32);
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);
 
 //===----------------------------------------------------------------------===//
 // Registration.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
index dde138b4c99afe4..f38779ed9ed2b82 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
     "affine::AffineDialect",
     "arith::ArithDialect",
     "bufferization::BufferizationDialect",
-    "gpu::GPUDialect",
     "LLVM::LLVMDialect",
     "linalg::LinalgDialect",
     "memref::MemRefDialect",
@@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
              clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                         "any-storage-any-loop",
                         "Enable sparse parallelization for any storage and loop."))}]>,
-    Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
-           "false",
-           "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
     Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
            "true", "Enable runtime library for manipulating sparse tensors">,
   ];
@@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
 def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
   let summary = "Generates GPU code during sparsification";
   let description = [{
-    Enables the sparsifier to use GPU acceleration.
+    Enables the sparsifier to use GPU acceleration. When the number of GPU
+    threads is set to zero, the pass tries to enable GPU acceleration by
+    means of direct library calls (like cuSPARSE).
   }];
   let constructor = "mlir::createSparseGPUCodegenPass()";
   let dependentDialects = [
@@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
     "sparse_tensor::SparseTensorDialect",
   ];
   let options = [
-    Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
+           "true", "Enable runtime library for manipulating sparse tensors">,
   ];
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 3ed8bba2514aaf9..6ee48482ad6ef88 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -31,7 +31,10 @@
 
 void mlir::sparse_tensor::buildSparseCompiler(
     OpPassManager &pm, const SparseCompilerOptions &options) {
+  // Rewrite named linalg ops into generic ops.
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());
+
+  // Sparsification and bufferization mini-pipeline.
   pm.addPass(createSparsificationAndBufferizationPass(
       getBufferizationOptionsForSparsification(
           options.testBufferizationAnalysisOnly),
@@ -39,10 +42,14 @@ void mlir::sparse_tensor::buildSparseCompiler(
       options.enableRuntimeLibrary, options.enableBufferInitialization,
       options.vectorLength,
       /*enableVLAVectorization=*/options.armSVE,
-      /*enableSIMDIndex32=*/options.force32BitVectorIndices));
+      /*enableSIMDIndex32=*/options.force32BitVectorIndices,
+      options.enableGPULibgen));
+
+  // Bail-early for test setup.
   if (options.testBufferizationAnalysisOnly)
     return;
 
+  // Storage specifier lowering and bufferization wrap-up.
   pm.addPass(createStorageSpecifierToLLVMPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(
@@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
   pm.addPass(createConvertMathToLibmPass());
   pm.addPass(createConvertComplexToLibmPass());
+
   // Repeat convert-vector-to-llvm.
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
+
   pm.addPass(createConvertComplexToLLVMPass());
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
   pm.addPass(createConvertFuncToLLVMPass());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index c139fcc8135154d..375e10f9068e43b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -82,19 +82,15 @@ struct SparsificationPass
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
-    enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
   }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableGPULibgen,
-                                  enableRuntimeLibrary);
-    // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
+    SparsificationOptions options(parallelization, enableRuntimeLibrary);
+    // Apply sparsification and cleanup rewriting.
     RewritePatternSet patterns(ctx);
-    if (enableGPULibgen)
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
@@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
     : public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
   SparseGPUCodegenPass() = default;
   SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
-  SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
+  SparseGPUCodegenPass(unsigned nT, bool enableRT) {
+    numThreads = nT;
+    enableRuntimeLibrary = enableRT;
+  }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
-    populateSparseGPUCodegenPatterns(patterns, numThreads);
+    if (numThreads == 0)
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+    else
+      populateSparseGPUCodegenPatterns(patterns, numThreads);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
   return std::make_unique<SparseGPUCodegenPass>();
 }
 
-std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
-  return std::make_unique<SparseGPUCodegenPass>(numThreads);
+std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
+                                                       bool enableRT) {
+  return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
 }
 
 std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
index e20b98add19adbf..94b25a358e804a7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
@@ -65,7 +65,7 @@ class SparsificationAndBufferizationPass
       const SparsificationOptions &sparsificationOptions,
       bool createSparseDeallocs, bool enableRuntimeLibrary,
       bool enableBufferInitialization, unsigned vectorLength,
-      bool enableVLAVectorization, bool enableSIMDIndex32)
+      bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
       : bufferizationOptions(bufferizationOptions),
         sparsificationOptions(sparsificationOptions),
         createSparseDeallocs(createSparseDeallocs),
@@ -73,7 +73,8 @@ class SparsificationAndBufferizationPass
         enableBufferInitialization(enableBufferInitialization),
         vectorLength(vectorLength),
         enableVLAVectorization(enableVLAVectorization),
-        enableSIMDIndex32(enableSIMDIndex32) {}
+        enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
+  }
 
   /// Bufferize all dense ops. This assumes that no further analysis is needed
   /// and that all required buffer copies were already inserted by
@@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
     // of `bufferization.alloc_tensor` ops.
     {
       OpPassManager pm("builtin.module");
+      if (enableGPULibgen)
+        pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
       pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
       pm.addPass(createSparsificationPass(sparsificationOptions));
       pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
@@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
   unsigned vectorLength;
   bool enableVLAVectorization;
   bool enableSIMDIndex32;
+  bool enableGPULibgen;
 };
 
 } // namespace sparse_tensor
@@ -210,7 +214,8 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
       /*enableBufferInitialization=*/false,
       /*vectorLength=*/0,
       /*enableVLAVectorization=*/false,
-      /*enableSIMDIndex32=*/false);
+      /*enableSIMDIndex32=*/false,
+      /*enableGPULibgen=*/false);
 }
 
 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
@@ -218,10 +223,10 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32) {
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
   return std::make_unique<
       mlir::sparse_tensor::SparsificationAndBufferizationPass>(
       bufferizationOptions, sparsificationOptions, createSparseDeallocs,
       enableRuntimeLibrary, enableBufferInitialization, vectorLength,
-      enableVLAVectorization, enableSIMDIndex32);
+      enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
 }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
index 73161bdb135ca4a..34189d329cc41ee 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
index 9973050d40799d4..f584977e96415bd 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 // CHECK-LABEL:   func.func @matmul(
 // CHECK-SAME:      %[[VAL_0:.*0]]: tensor<?x?xf16>,
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
index 50ff81cb6ecd0a6..bd0bf6927b0da4c 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
index 221bda47291ebf9..ce7af53bb346278 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #trait_sampled_dense_dense = {
   indexing_maps = [
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
index 6afb626625cfe2d..dd79a9017f7f4cb 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #BSR = #sparse_tensor.encoding<{
   map = (i, j) -> (
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
index 027c9fda5da90eb..7ac37c1c4950c09 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
old mode 100755
new mode 100644
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
old mode 100755
new mode 100644
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
index 6c3d67e2ea78dc6..735dc8cb4bb3611 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -85,32 +85,30 @@ module {
   // A kernel that computes a BSR sampled dense matrix matrix multiplication
   // using a "spy" function and in-place update of the sampling sparse matrix.
   //
-  // TODO: re-enable the following test.
-  //
-  // func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
-  //                        %arga: tensor<?x?xf32>,
-  //                        %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
-  //   %result = linalg.generic #trait_SDDMM
-  //     ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
-  //     outs(%args: tensor<?x?xf32, #BSR>) {
-  //       ^bb(%a: f32, %b: f32, %s: f32):
-  //          %f0 = arith.constant 0.0 : f32
-  //          %u = sparse_tensor.unary %s : f32 to f32
-  //            present={
-  //               ^bb0(%p: f32):
-  //                 %mul = arith.mulf %a, %b : f32
-  //                 sparse_tensor.yield %mul : f32
-  //            }
-  //            absent={}
-  //          %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
-  //             ^bb0(%p: f32, %q: f32):
-  //               %add = arith.addf %p, %q : f32
-  //               sparse_tensor.yield %add : f32
-  //           }
-  //          linalg.yield %r : f32
-  //     } -> tensor<?x?xf32, #BSR>
-  //   return %result : tensor<?x?xf32, #BSR>
-  // }
+  func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
+                         %arga: tensor<?x?xf32>,
+                         %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
+    %result = linalg.generic #trait_SDDMM
+      ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%args: tensor<?x?xf32, #BSR>) {
+        ^bb(%a: f32, %b: f32, %s: f32):
+           %f0 = arith.constant 0.0 : f32
+           %u = sparse_tensor.unary %s : f32 to f32
+             present={
+                ^bb0(%p: f32):
+                  %mul = arith.mulf %a, %b : f32
+                  sparse_tensor.yield %mul : f32
+             }
+             absent={}
+           %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
+              ^bb0(%p: f32, %q: f32):
+                %add = arith.addf %p, %q : f32
+                sparse_tensor.yield %add : f32
+            }
+           linalg.yield %r : f32
+      } -> tensor<?x?xf32, #BSR>
+    return %result : tensor<?x?xf32, #BSR>
+  }
 
   func.func private @getTensorFilename(index) -> (!Filename)
 
@@ -153,15 +151,15 @@ module {
     //
     %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
     %m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
-    // %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
+    %m_bsr = sparse_tensor.new %fileNam...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/72185