[Mlir-commits] [mlir] 83cf0dc - [mlir][sparse] implement direct IR alloc/empty/new for non-permutations (#72585)

Thu Nov 16 17:17:46 PST 2023

Author: Aart Bik
Date: 2023-11-16T17:17:41-08:00
New Revision: 83cf0dc98234bbd8cb0d0959baa570477a8daf92

URL: https://github.com/llvm/llvm-project/commit/83cf0dc98234bbd8cb0d0959baa570477a8daf92
DIFF: https://github.com/llvm/llvm-project/commit/83cf0dc98234bbd8cb0d0959baa570477a8daf92.diff

LOG: [mlir][sparse] implement direct IR alloc/empty/new for non-permutations (#72585)

This change implements the correct *level* sizes set up for the direct
IR codegen fields in the sparse storage scheme. This brings libgen and
codegen together again.

This is step 3 out of 3 to make sparse_tensor.new work for BSR

Added: 
    

Modified: 
    mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
    mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
    mlir/test/Dialect/SparseTensor/codegen.mlir
    mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
    mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
index 85bbfe06a86a020..6b4a174596ffef6 100644

--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -1,4 +1,4 @@
-//===- File.h - Reading sparse tensors from files --------------*- C++ -*-===//
+//===- File.h - Reading sparse tensors from files ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

diff  --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index cfc8eb19918b77e..9f41db73a509112 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -159,41 +159,46 @@ static Value createAllocation(OpBuilder &builder, Location loc,
   return buffer;
 }
 
+/// Creates the dim sizes array, filling in from dynamic sizes.
+static void createDimSizes(OpBuilder &builder, Location loc,
+                           SparseTensorType stt, ValueRange dynSizes,
+                           /*out*/ SmallVectorImpl<Value> &dimSizesValues) {
+  const Dimension dimRank = stt.getDimRank();
+  dimSizesValues.clear();
+  dimSizesValues.reserve(dimRank);
+  unsigned i = 0;
+  for (const Size sz : stt.getDimShape())
+    dimSizesValues.push_back(ShapedType::isDynamic(sz)
+                                 ? dynSizes[i++]
+                                 : constantIndex(builder, loc, sz));
+}
+
 /// Creates allocation for each field in sparse tensor type. Note that
 /// for all dynamic memrefs in the sparse tensor stroage layout, the
 /// memory size is really the capacity of the "vector", while the actual
 /// size resides in the sizes array.
 static void createAllocFields(OpBuilder &builder, Location loc,
-                              SparseTensorType stt, ValueRange dynSizes,
-                              bool enableInit, SmallVectorImpl<Value> &fields,
-                              Value sizeHint) {
-  // Build original sizes.
-  assert((dynSizes.size() == static_cast<size_t>(stt.getNumDynamicDims())) &&
-         "Got wrong number of dynamic sizes");
-  const Dimension dimRank = stt.getDimRank();
-  SmallVector<Value> dimSizes;
-  dimSizes.reserve(dimRank);
-  unsigned i = 0; // cumulative index into `dynSizes`.
-  for (const Size sh : stt.getDimShape())
-    dimSizes.push_back(ShapedType::isDynamic(sh)
-                           ? dynSizes[i++]
-                           : constantIndex(builder, loc, sh));
-
+                              SparseTensorType stt, bool enableInit,
+                              Value sizeHint,
+                              SmallVectorImpl<Value> &lvlSizesValues,
+                              /*out*/ SmallVectorImpl<Value> &fields) {
+  Level lvlRank = stt.getLvlRank();
   // Set up some heuristic sizes. We try to set the initial
   // size based on available information. Otherwise we just
   // initialize a few elements to start the reallocation chain.
   // TODO: refine this
   Value posHeuristic, crdHeuristic, valHeuristic;
   if (stt.isAllDense()) {
-    valHeuristic = dimSizes[0];
-    for (const Value sz : ArrayRef<Value>{dimSizes}.drop_front())
-      valHeuristic = builder.create<arith::MulIOp>(loc, valHeuristic, sz);
+    valHeuristic = lvlSizesValues[0];
+    for (Level lvl = 1; lvl < lvlRank; lvl++)
+      valHeuristic =
+          builder.create<arith::MulIOp>(loc, valHeuristic, lvlSizesValues[lvl]);
   } else if (sizeHint) {
     if (getCOOStart(stt.getEncoding()) == 0) {
       posHeuristic = constantIndex(builder, loc, 2);
       crdHeuristic = builder.create<arith::MulIOp>(
-          loc, constantIndex(builder, loc, dimRank), sizeHint); // AOS
-    } else if (dimRank == 2 && stt.isDenseLvl(0) && stt.isCompressedLvl(1)) {
+          loc, constantIndex(builder, loc, lvlRank), sizeHint); // AOS
+    } else if (lvlRank == 2 && stt.isDenseLvl(0) && stt.isCompressedLvl(1)) {
       posHeuristic = builder.create<arith::AddIOp>(
           loc, sizeHint, constantIndex(builder, loc, 1));
       crdHeuristic = sizeHint;
@@ -205,7 +210,6 @@ static void createAllocFields(OpBuilder &builder, Location loc,
     posHeuristic = crdHeuristic = valHeuristic =
         constantIndex(builder, loc, 16);
   }
-
   // Initializes all fields. An initial storage specifier and allocated
   // positions/coordinates/values memrefs (with heuristic capacity).
   foreachFieldAndTypeInSparseTensor(
@@ -237,16 +241,13 @@ static void createAllocFields(OpBuilder &builder, Location loc,
         // Returns true to continue the iteration.
         return true;
       });
-
   // Initialize the storage scheme to an empty tensor. Sets the lvlSizes
   // and gives all position fields an initial zero entry, so that it is
   // easier to maintain the "linear + 1" length property.
   MutSparseTensorDescriptor desc(stt, fields);
   Value posZero = constantZero(builder, loc, stt.getPosType());
   for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) {
-    // FIXME: `toOrigDim` is deprecated.
-    desc.setLvlSize(builder, loc, lvl,
-                    dimSizes[toOrigDim(stt.getEncoding(), lvl)]);
+    desc.setLvlSize(builder, loc, lvl, lvlSizesValues[lvl]);
     const auto dlt = stt.getLvlType(lvl);
     if (isCompressedDLT(dlt) || isLooseCompressedDLT(dlt))
       createPushback(builder, loc, desc, SparseTensorFieldKind::PosMemRef, lvl,
@@ -371,8 +372,8 @@ static void genEndInsert(OpBuilder &builder, Location loc,
                          SparseTensorDescriptor desc) {
   const SparseTensorType stt(desc.getRankedTensorType());
   const Level lvlRank = stt.getLvlRank();
-  for (Level l = 0; l < lvlRank; l++) {
-    const auto dlt = stt.getLvlType(l);
+  for (Level lvl = 0; lvl < lvlRank; lvl++) {
+    const auto dlt = stt.getLvlType(lvl);
     if (isCompressedDLT(dlt)) {
       // Compressed dimensions need a position cleanup for all entries
       // that were not visited during the insertion pass.
@@ -380,10 +381,10 @@ static void genEndInsert(OpBuilder &builder, Location loc,
       // TODO: avoid cleanup and keep compressed scheme consistent at all
       // times?
       //
-      if (l > 0) {
+      if (lvl > 0) {
         Type posType = stt.getPosType();
-        Value posMemRef = desc.getPosMemRef(l);
-        Value hi = desc.getPosMemSize(builder, loc, l);
+        Value posMemRef = desc.getPosMemRef(lvl);
+        Value hi = desc.getPosMemSize(builder, loc, lvl);
         Value zero = constantIndex(builder, loc, 0);
         Value one = constantIndex(builder, loc, 1);
         // Vector of only one, but needed by createFor's prototype.
@@ -729,7 +730,6 @@ class SparseReMapConverter : public OpConversionPattern<ReinterpretMapOp> {
 };
 
 /// Sparse codegen rule for the alloc operator.
-/// TODO(springerm): remove when bufferization.alloc_tensor is gone
 class SparseTensorAllocConverter
     : public OpConversionPattern<bufferization::AllocTensorOp> {
 public:
@@ -746,6 +746,8 @@ class SparseTensorAllocConverter
     if (!resType.hasEncoding())
       return failure();
     Location loc = op.getLoc();
+
+    // Deal with copy.
     if (op.getCopy()) {
       auto desc = getDescriptorFromTensorTuple(adaptor.getCopy());
       SmallVector<Value> fields;
@@ -766,17 +768,22 @@ class SparseTensorAllocConverter
       return success();
     }
 
+    // Construct the dim/lvl sizes and the (unused) dim2lvl/lvl2dim buffers.
+    SmallVector<Value> dimSizesValues;
+    SmallVector<Value> lvlSizesValues;
+    Value dimSizesBuffer;
+    Value dim2lvlBuffer;
+    Value lvl2dimBuffer;
+    createDimSizes(rewriter, loc, resType, adaptor.getDynamicSizes(),
+                   dimSizesValues);
+    genMapBuffers(rewriter, loc, resType, dimSizesValues, dimSizesBuffer,
+                  lvlSizesValues, dim2lvlBuffer, lvl2dimBuffer);
+
     // Construct allocation for each field.
     Value sizeHint = op.getSizeHint();
-    ValueRange dynSizes = adaptor.getDynamicSizes();
-    const size_t found = dynSizes.size();
-    const int64_t expected = resType.getNumDynamicDims();
-    if (found != static_cast<size_t>(expected))
-      return rewriter.notifyMatchFailure(op,
-                                         "Got wrong number of dynamic sizes");
     SmallVector<Value> fields;
-    createAllocFields(rewriter, loc, resType, dynSizes,
-                      enableBufferInitialization, fields, sizeHint);
+    createAllocFields(rewriter, loc, resType, enableBufferInitialization,
+                      sizeHint, lvlSizesValues, fields);
 
     // Replace operation with resulting memrefs.
     rewriter.replaceOp(op, genTuple(rewriter, loc, resType, fields));
@@ -788,7 +795,6 @@ class SparseTensorAllocConverter
 };
 
 /// Sparse codegen rule for the empty tensor operator.
-/// TODO(springerm): remove when bufferization.alloc_tensor is gone
 class SparseTensorEmptyConverter : public OpConversionPattern<tensor::EmptyOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -803,19 +809,24 @@ class SparseTensorEmptyConverter : public OpConversionPattern<tensor::EmptyOp> {
     const auto resType = getSparseTensorType(op);
     if (!resType.hasEncoding())
       return failure();
+    Location loc = op.getLoc();
+
+    // Construct the dim/lvl sizes and the (unused) dim2lvl/lvl2dim buffers.
+    SmallVector<Value> dimSizesValues;
+    SmallVector<Value> lvlSizesValues;
+    Value dimSizesBuffer;
+    Value dim2lvlBuffer;
+    Value lvl2dimBuffer;
+    createDimSizes(rewriter, loc, resType, adaptor.getDynamicSizes(),
+                   dimSizesValues);
+    genMapBuffers(rewriter, loc, resType, dimSizesValues, dimSizesBuffer,
+                  lvlSizesValues, dim2lvlBuffer, lvl2dimBuffer);
 
     // Construct allocation for each field.
-    Location loc = op.getLoc();
     Value sizeHint; // none
-    const ValueRange dynSizes = adaptor.getDynamicSizes();
-    const size_t found = dynSizes.size();
-    const int64_t expected = resType.getNumDynamicDims();
-    if (found != static_cast<size_t>(expected))
-      return rewriter.notifyMatchFailure(op,
-                                         "Got wrong number of dynamic sizes");
     SmallVector<Value> fields;
-    createAllocFields(rewriter, loc, resType, dynSizes,
-                      enableBufferInitialization, fields, sizeHint);
+    createAllocFields(rewriter, loc, resType, enableBufferInitialization,
+                      sizeHint, lvlSizesValues, fields);
 
     // Replace operation with resulting memrefs.
     rewriter.replaceOp(op, genTuple(rewriter, loc, resType, fields));
@@ -1461,10 +1472,10 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
     //   if (! %isSorted) sparse_tensor.sort_coo(%nse, %coordinates, %values)
     //   update storage specifier
     //   @delSparseTensorReader(%reader)
-    SmallVector<Value> dimShapesValues;
+    SmallVector<Value> dimSizesValues;
     Value dimSizesBuffer;
     Value reader = genReader(rewriter, loc, dstTp, adaptor.getOperands()[0],
-                             dimShapesValues, dimSizesBuffer);
+                             dimSizesValues, dimSizesBuffer);
 
     // Get the number of stored entries.
     const Type indexTp = rewriter.getIndexType();
@@ -1472,25 +1483,19 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
                                {indexTp}, {reader}, EmitCInterface::Off)
                     .getResult(0);
 
-    // Construct allocation for each field.
-    SmallVector<Value> dynSizes;
-    if (dstTp.hasDynamicDimShape()) {
-      for (const auto &d : llvm::enumerate(dstTp.getDimShape()))
-        if (ShapedType::isDynamic(d.value()))
-          dynSizes.push_back(rewriter.create<memref::LoadOp>(
-              loc, dimSizesBuffer, constantIndex(rewriter, loc, d.index())));
-    }
-    SmallVector<Value> fields;
-    createAllocFields(rewriter, loc, dstTp, dynSizes, /*enableInit=*/false,
-                      fields, nse);
-
-    // Now construct the lvl sizes and the dim2lvl/lvl2dim buffers.
+    // Construct the lvl sizes and the dim2lvl/lvl2dim buffers.
     SmallVector<Value> lvlSizesValues;
     Value dim2lvlBuffer;
     Value lvl2dimBuffer;
-    genMapBuffers(rewriter, loc, dstTp, dimShapesValues, dimSizesBuffer,
+    genMapBuffers(rewriter, loc, dstTp, dimSizesValues, dimSizesBuffer,
                   lvlSizesValues, dim2lvlBuffer, lvl2dimBuffer);
 
+    // Construct allocation for each field.
+    Value sizeHint = nse;
+    SmallVector<Value> fields;
+    createAllocFields(rewriter, loc, dstTp, /*enableInit=*/false, sizeHint,
+                      lvlSizesValues, fields);
+
     // Read the COO tensor data.
     MutSparseTensorDescriptor desc(dstTp, fields);
     Value xs = desc.getAOSMemRef();

diff  --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir
index 12d77ecfecc6ac9..e63595bed53e599 100644
--- a/mlir/test/Dialect/SparseTensor/codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen.mlir
@@ -252,7 +252,7 @@ func.func @sparse_values_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xf64> {
 }
 
 
-// CHECK-LABEL:   func.func @sparse_indices_coo(
+// CHECK-LABEL: func.func @sparse_indices_coo(
 //  CHECK-SAME: %[[A0:.*0]]: memref<?xindex>,
 //  CHECK-SAME: %[[A1:.*1]]: memref<?xindex>,
 //  CHECK-SAME: %[[A2:.*2]]: memref<?xindex>,
@@ -270,7 +270,7 @@ func.func @sparse_indices_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xindex
   return %0 : memref<?xindex, strided<[?], offset: ?>>
 }
 
-// CHECK-LABEL:   func.func @sparse_indices_buffer_coo(
+// CHECK-LABEL: func.func @sparse_indices_buffer_coo(
 //  CHECK-SAME: %[[A0:.*0]]: memref<?xindex>,
 //  CHECK-SAME: %[[A1:.*1]]: memref<?xindex>,
 //  CHECK-SAME: %[[A2:.*2]]: memref<?xindex>,
@@ -676,26 +676,26 @@ func.func @sparse_convert_element_type(%arg0: tensor<32xf32, #SparseVector>) ->
 // CHECK:       memref.store %[[VAL_4]], %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<2xindex>
 // CHECK:       %[[VAL_8:.*]] = call @createCheckedSparseTensorReader(%[[A0]], %[[VAL_7]], %[[VAL_2]]) : (!llvm.ptr, memref<?xindex>, i32) -> !llvm.ptr
 // CHECK:       %[[VAL_9:.*]] = call @getSparseTensorReaderDimSizes(%[[VAL_8]]) : (!llvm.ptr) -> memref<?xindex>
-// CHECK:       %[[VAL_10:.*]] = call @getSparseTensorReaderNSE(%[[VAL_8]]) : (!llvm.ptr) -> index
-// CHECK:       %[[VAL_11:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK:       %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK:       %[[VAL_13:.*]] = arith.muli %[[VAL_10]], %[[VAL_5]] : index
-// CHECK:       %[[VAL_14:.*]] = memref.alloc() : memref<2xindex>
-// CHECK:       %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<2xindex> to memref<?xindex>
-// CHECK:       %[[VAL_16:.*]] = memref.alloc(%[[VAL_13]]) : memref<?xindex>
-// CHECK:       %[[VAL_17:.*]] = memref.alloc(%[[VAL_10]]) : memref<?xf32>
-// CHECK:       %[[VAL_18:.*]] = sparse_tensor.storage_specifier.init
-// CHECK:       %[[VAL_19:.*]] = sparse_tensor.storage_specifier.set %[[VAL_18]]  lvl_sz at 0 with %[[VAL_11]]
-// CHECK:       %[[VAL_20:.*]] = sparse_tensor.storage_specifier.get %[[VAL_19]]  pos_mem_sz at 0
-// CHECK:       %[[VAL_21:.*]], %[[VAL_22:.*]] = sparse_tensor.push_back %[[VAL_20]], %[[VAL_15]], %[[VAL_4]]
-// CHECK:       %[[VAL_23:.*]] = sparse_tensor.storage_specifier.set %[[VAL_19]]  pos_mem_sz at 0 with %[[VAL_22]]
-// CHECK:       %[[VAL_24:.*]] = sparse_tensor.storage_specifier.set %[[VAL_23]]  lvl_sz at 1 with %[[VAL_12]]
-// CHECK:       %[[VAL_25:.*]], %[[VAL_26:.*]] = sparse_tensor.push_back %[[VAL_22]], %[[VAL_21]], %[[VAL_4]], %[[VAL_3]]
-// CHECK:       %[[VAL_27:.*]] = sparse_tensor.storage_specifier.set %[[VAL_24]]  pos_mem_sz at 0 with %[[VAL_26]]
-// CHECK:       %[[VAL_28:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:       %[[VAL_29:.*]] = memref.cast %[[VAL_28]] : memref<2xindex> to memref<?xindex>
-// CHECK:       memref.store %[[VAL_4]], %[[VAL_28]]{{\[}}%[[VAL_4]]] : memref<2xindex>
-// CHECK:       memref.store %[[VAL_3]], %[[VAL_28]]{{\[}}%[[VAL_3]]] : memref<2xindex>
+// CHECK-DAG:   %[[VAL_10:.*]] = call @getSparseTensorReaderNSE(%[[VAL_8]]) : (!llvm.ptr) -> index
+// CHECK-DAG:   %[[VAL_11:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG:   %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG:   %[[VAL_13:.*]] = arith.muli %[[VAL_10]], %[[VAL_5]] : index
+// CHECK-DAG:   %[[VAL_14:.*]] = memref.alloc() : memref<2xindex>
+// CHECK-DAG:   %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<2xindex> to memref<?xindex>
+// CHECK-DAG:   %[[VAL_16:.*]] = memref.alloc(%[[VAL_13]]) : memref<?xindex>
+// CHECK-DAG:   %[[VAL_17:.*]] = memref.alloc(%[[VAL_10]]) : memref<?xf32>
+// CHECK-DAG:   %[[VAL_18:.*]] = sparse_tensor.storage_specifier.init
+// CHECK-DAG:   %[[VAL_19:.*]] = sparse_tensor.storage_specifier.set %[[VAL_18]]  lvl_sz at 0 with %[[VAL_11]]
+// CHECK-DAG:   %[[VAL_20:.*]] = sparse_tensor.storage_specifier.get %[[VAL_19]]  pos_mem_sz at 0
+// CHECK-DAG:   %[[VAL_21:.*]], %[[VAL_22:.*]] = sparse_tensor.push_back %[[VAL_20]], %[[VAL_15]], %[[VAL_4]]
+// CHECK-DAG:   %[[VAL_23:.*]] = sparse_tensor.storage_specifier.set %[[VAL_19]]  pos_mem_sz at 0 with %[[VAL_22]]
+// CHECK-DAG:   %[[VAL_24:.*]] = sparse_tensor.storage_specifier.set %[[VAL_23]]  lvl_sz at 1 with %[[VAL_12]]
+// CHECK-DAG:   %[[VAL_25:.*]], %[[VAL_26:.*]] = sparse_tensor.push_back %[[VAL_22]], %[[VAL_21]], %[[VAL_4]], %[[VAL_3]]
+// CHECK-DAG:   %[[VAL_27:.*]] = sparse_tensor.storage_specifier.set %[[VAL_24]]  pos_mem_sz at 0 with %[[VAL_26]]
+// CHECK-DAG:   %[[VAL_28:.*]] = memref.alloca() : memref<2xindex>
+// CHECK-DAG:   %[[VAL_29:.*]] = memref.cast %[[VAL_28]] : memref<2xindex> to memref<?xindex>
+// CHECK-DAG:   memref.store %[[VAL_4]], %[[VAL_28]]{{\[}}%[[VAL_4]]] : memref<2xindex>
+// CHECK-DAG:   memref.store %[[VAL_3]], %[[VAL_28]]{{\[}}%[[VAL_3]]] : memref<2xindex>
 // CHECK:       %[[VAL_30:.*]] = call @getSparseTensorReaderReadToBuffers0F32(%[[VAL_8]], %[[VAL_29]], %[[VAL_29]], %[[VAL_16]], %[[VAL_17]]) : (!llvm.ptr, memref<?xindex>, memref<?xindex>, memref<?xindex>, memref<?xf32>) -> i1
 // CHECK:       %[[VAL_31:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_1]] : i1
 // CHECK:       scf.if %[[VAL_31]] {
@@ -722,31 +722,31 @@ func.func @sparse_new_coo(%arg0: !llvm.ptr) -> tensor<?x?xf32, #Coo> {
 // CHECK:       memref.store %[[VAL_3]], %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<2xindex>
 // CHECK:       memref.store %[[VAL_3]], %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref<2xindex>
 // CHECK:       %[[VAL_7:.*]] = call @createCheckedSparseTensorReader(%[[A0]], %[[VAL_6]], %[[VAL_1]]) : (!llvm.ptr, memref<?xindex>, i32) -> !llvm.ptr
-// CHECK:       %[[VAL_8:.*]] = call @getSparseTensorReaderDimSizes(%[[VAL_7]]) : (!llvm.ptr) -> memref<?xindex>
-// CHECK:       %[[VAL_9:.*]] = call @getSparseTensorReaderNSE(%[[VAL_7]]) : (!llvm.ptr) -> index
-// CHECK:       %[[VAL_10:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK:       %[[VAL_11:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK:       %[[VAL_12:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index
-// CHECK:       %[[VAL_13:.*]] = memref.alloc() : memref<2xindex>
-// CHECK:       %[[VAL_14:.*]] = memref.cast %[[VAL_13]] : memref<2xindex> to memref<?xindex>
-// CHECK:       %[[VAL_15:.*]] = memref.alloc(%[[VAL_12]]) : memref<?xindex>
-// CHECK:       %[[VAL_16:.*]] = memref.alloc(%[[VAL_9]]) : memref<?xf32>
-// CHECK:       %[[VAL_17:.*]] = sparse_tensor.storage_specifier.init
-// CHECK:       %[[VAL_18:.*]] = sparse_tensor.storage_specifier.set %[[VAL_17]]  lvl_sz at 0 with %[[VAL_11]]
-// CHECK:       %[[VAL_19:.*]] = sparse_tensor.storage_specifier.get %[[VAL_18]]  pos_mem_sz at 0
-// CHECK:       %[[VAL_20:.*]], %[[VAL_21:.*]] = sparse_tensor.push_back %[[VAL_19]], %[[VAL_14]], %[[VAL_3]]
-// CHECK:       %[[VAL_22:.*]] = sparse_tensor.storage_specifier.set %[[VAL_18]]  pos_mem_sz at 0 with %[[VAL_21]]
-// CHECK:       %[[VAL_23:.*]] = sparse_tensor.storage_specifier.set %[[VAL_22]]  lvl_sz at 1 with %[[VAL_10]]
-// CHECK:       %[[VAL_24:.*]], %[[VAL_25:.*]] = sparse_tensor.push_back %[[VAL_21]], %[[VAL_20]], %[[VAL_3]], %[[VAL_2]]
-// CHECK:       %[[VAL_26:.*]] = sparse_tensor.storage_specifier.set %[[VAL_23]]  pos_mem_sz at 0 with %[[VAL_25]]
-// CHECK:       %[[VAL_27:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:       %[[VAL_28:.*]] = memref.cast %[[VAL_27]] : memref<2xindex> to memref<?xindex>
-// CHECK:       memref.store %[[VAL_2]], %[[VAL_27]]{{\[}}%[[VAL_3]]] : memref<2xindex>
-// CHECK:       memref.store %[[VAL_3]], %[[VAL_27]]{{\[}}%[[VAL_2]]] : memref<2xindex>
-// CHECK:       %[[VAL_29:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:       %[[VAL_30:.*]] = memref.cast %[[VAL_29]] : memref<2xindex> to memref<?xindex>
-// CHECK:       memref.store %[[VAL_2]], %[[VAL_29]]{{\[}}%[[VAL_3]]] : memref<2xindex>
-// CHECK:       memref.store %[[VAL_3]], %[[VAL_29]]{{\[}}%[[VAL_2]]] : memref<2xindex>
+// CHECK-DAG:   %[[VAL_8:.*]] = call @getSparseTensorReaderDimSizes(%[[VAL_7]]) : (!llvm.ptr) -> memref<?xindex>
+// CHECK-DAG:   %[[VAL_9:.*]] = call @getSparseTensorReaderNSE(%[[VAL_7]]) : (!llvm.ptr) -> index
+// CHECK-DAG:   %[[VAL_10:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG:   %[[VAL_11:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK-DAG:   %[[VAL_12:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index
+// CHECK-DAG:   %[[VAL_13:.*]] = memref.alloc() : memref<2xindex>
+// CHECK-DAG:   %[[VAL_14:.*]] = memref.cast %[[VAL_13]] : memref<2xindex> to memref<?xindex>
+// CHECK-DAG:   %[[VAL_15:.*]] = memref.alloc(%[[VAL_12]]) : memref<?xindex>
+// CHECK-DAG:   %[[VAL_16:.*]] = memref.alloc(%[[VAL_9]]) : memref<?xf32>
+// CHECK-DAG:   %[[VAL_17:.*]] = sparse_tensor.storage_specifier.init
+// CHECK-DAG:   %[[VAL_18:.*]] = sparse_tensor.storage_specifier.set %[[VAL_17]]  lvl_sz at 0 with %[[VAL_11]]
+// CHECK-DAG:   %[[VAL_19:.*]] = sparse_tensor.storage_specifier.get %[[VAL_18]]  pos_mem_sz at 0
+// CHECK-DAG:   %[[VAL_20:.*]], %[[VAL_21:.*]] = sparse_tensor.push_back %[[VAL_19]], %[[VAL_14]], %[[VAL_3]]
+// CHECK-DAG:   %[[VAL_22:.*]] = sparse_tensor.storage_specifier.set %[[VAL_18]]  pos_mem_sz at 0 with %[[VAL_21]]
+// CHECK-DAG:   %[[VAL_23:.*]] = sparse_tensor.storage_specifier.set %[[VAL_22]]  lvl_sz at 1 with %[[VAL_10]]
+// CHECK-DAG:   %[[VAL_24:.*]], %[[VAL_25:.*]] = sparse_tensor.push_back %[[VAL_21]], %[[VAL_20]], %[[VAL_3]], %[[VAL_2]]
+// CHECK-DAG:   %[[VAL_26:.*]] = sparse_tensor.storage_specifier.set %[[VAL_23]]  pos_mem_sz at 0 with %[[VAL_25]]
+// CHECK-DAG:   %[[VAL_27:.*]] = memref.alloca() : memref<2xindex>
+// CHECK-DAG:   %[[VAL_28:.*]] = memref.cast %[[VAL_27]] : memref<2xindex> to memref<?xindex>
+// CHECK-DAG:   memref.store %[[VAL_2]], %[[VAL_27]]{{\[}}%[[VAL_3]]] : memref<2xindex>
+// CHECK-DAG:   memref.store %[[VAL_3]], %[[VAL_27]]{{\[}}%[[VAL_2]]] : memref<2xindex>
+// CHECK-DAG:   %[[VAL_29:.*]] = memref.alloca() : memref<2xindex>
+// CHECK-DAG:   %[[VAL_30:.*]] = memref.cast %[[VAL_29]] : memref<2xindex> to memref<?xindex>
+// CHECK-DAG:   memref.store %[[VAL_2]], %[[VAL_29]]{{\[}}%[[VAL_3]]] : memref<2xindex>
+// CHECK-DAG:   memref.store %[[VAL_3]], %[[VAL_29]]{{\[}}%[[VAL_2]]] : memref<2xindex>
 // CHECK:       %[[VAL_31:.*]] = call @getSparseTensorReaderReadToBuffers0F32(%[[VAL_7]], %[[VAL_28]], %[[VAL_30]], %[[VAL_15]], %[[VAL_16]]) : (!llvm.ptr, memref<?xindex>, memref<?xindex>, memref<?xindex>, memref<?xf32>) -> i1
 // CHECK:       memref.store %[[VAL_9]], %[[VAL_24]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:       %[[VAL_32:.*]] = sparse_tensor.storage_specifier.set %[[VAL_26]]  crd_mem_sz at 0 with %[[VAL_12]]

diff  --git a/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir b/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir
index 14a8f0e43d35aad..e6d9700ffe4f838 100644
--- a/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen_buffer_initialization.mlir
@@ -4,9 +4,9 @@
 
 // CHECK-LABEL:   func.func @empty_sparse_vector(
 //  CHECK-SAME:     %[[VAL_0:.*]]: index) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
-//       CHECK:     %[[VAL_1:.*]] = arith.constant 1 : index
-//       CHECK:     %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f64
-//       CHECK:     %[[VAL_3:.*]] = arith.constant 0 : index
+//   CHECK-DAG:     %[[VAL_1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:     %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f64
+//   CHECK-DAG:     %[[VAL_3:.*]] = arith.constant 0 : index
 //       CHECK:     %[[VAL_4:.*]] = memref.alloc() : memref<16xindex>
 //       CHECK:     %[[VAL_5:.*]] = memref.cast %[[VAL_4]] : memref<16xindex> to memref<?xindex>
 //       CHECK:     linalg.fill ins(%[[VAL_3]] : index) outs(%[[VAL_4]] : memref<16xindex>)

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
index d92165e98cea4a1..6468c4b45d2479d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
@@ -3,7 +3,7 @@
 //
 // Set-up that's shared across all tests in this directory. In principle, this
 // config could be moved to lit.local.cfg. However, there are downstream users that
-//  do not use these LIT config files. Hence why this is kept inline.
+// do not use these LIT config files. Hence why this is kept inline.
 //
 // DEFINE: %{sparsifier_opts} = enable-runtime-library=true
 // DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
@@ -20,10 +20,13 @@
 // REDEFINE: %{env} = TENSOR0="%mlir_src_dir/test/Integration/data/block.mtx"
 // RUN: %{compile} | env %{env} %{run} | FileCheck %s
 //
-// TODO: enable!
 // Do the same run, but now with direct IR generation.
 // REDEFINE: %{sparsifier_opts} = enable-runtime-library=false
-// R_UN: %{compile} | env %{env} %{run} | FileCheck %s
+// RUN: %{compile} | env %{env} %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | env %{env} %{run} | FileCheck %s
 
 !Filename = !llvm.ptr
 

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
index fb0da0c0e649e5c..db5c154e5e438da 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -16,8 +16,7 @@
 //
 // without RT lib:
 //
-// TODO: make this work
-// R_U_N: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false" | %{run}
 
 !Filename = !llvm.ptr