[Mlir-commits] [mlir] [mlir][sparse] refactor dense2sparse and const2sparse conversion (PR #68935)

Aart Bik llvmlistbot at llvm.org
Thu Oct 12 15:11:27 PDT 2023


https://github.com/aartbik created https://github.com/llvm/llvm-project/pull/68935

The new code uses the MapRef data structure, which prepares for our move to non-permutation types, such as block sparsity.
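
For readers unfamiliar with the scheme, here is a minimal, standalone C++ sketch of the mechanism this patch moves to. The types SimpleMap, ForwardingStorage, and Element are illustrative stand-ins, not the actual MLIR classes; only the method names forwardingInsert and endForwardingInsert mirror what the patch adds. Dimension coordinates are pushed forward through a dim-to-lvl map into a level COO buffer, which is sorted and consumed once insertion is finalized.

// Minimal standalone sketch (illustrative stand-ins, not the MLIR classes).
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// One buffered insertion: level coordinates plus the value.
struct Element {
  std::vector<uint64_t> lvlCoords;
  double value;
};

// Stand-in for MapRef, restricted here to a permutation from dim- to lvl-space.
struct SimpleMap {
  std::vector<uint64_t> dim2lvl; // lvl = dim2lvl[dim]
  void pushforward(const uint64_t *dimCoords, uint64_t *lvlCoords) const {
    for (uint64_t d = 0, rank = dim2lvl.size(); d < rank; ++d)
      lvlCoords[dim2lvl[d]] = dimCoords[d];
  }
};

// Stand-in for a tensor storage object carrying a forwarding COO buffer.
struct ForwardingStorage {
  explicit ForwardingStorage(SimpleMap m)
      : map(std::move(m)), lvlCursor(map.dim2lvl.size()) {}

  // Mirrors forwardingInsert(): map dim coords to lvl coords, buffer in COO.
  void forwardingInsert(const uint64_t *dimCoords, double val) {
    map.pushforward(dimCoords, lvlCursor.data());
    lvlCOO.push_back({lvlCursor, val});
  }

  // Mirrors endForwardingInsert(): sort the COO, then build final storage.
  void endForwardingInsert() {
    std::sort(lvlCOO.begin(), lvlCOO.end(),
              [](const Element &a, const Element &b) {
                return a.lvlCoords < b.lvlCoords;
              });
    // A real implementation now builds the positions/coordinates/values
    // arrays from the sorted elements (fromCOO in the patch); omitted here.
  }

  SimpleMap map;
  std::vector<Element> lvlCOO;     // buffered insertions in level space
  std::vector<uint64_t> lvlCursor; // scratch level coordinates
};

With this in place, the conversion pass only needs to generate the loop nest over the dense input (or over the nonzeros of a sparse constant), store each nonzero through forwardingInsert, and issue a single endForwardingInsert at the end, which is exactly the shape of the code in the updated tests below.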

>From 3d8f29859dbb6782effaedfb3594f10290f27783 Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik at google.com>
Date: Thu, 12 Oct 2023 15:04:58 -0700
Subject: [PATCH] [mlir][sparse] refactor dense2sparse and const2sparse
 conversion

The new code uses the MapRef data structure, which prepares for
our move to non-permutation types, such as block sparsity.
---
 .../mlir/Dialect/SparseTensor/IR/Enums.h      |   5 +-
 .../ExecutionEngine/SparseTensor/Storage.h    | 189 ++++++----
 .../ExecutionEngine/SparseTensorRuntime.h     |  35 +-
 .../Transforms/SparseTensorConversion.cpp     |  77 ++---
 .../ExecutionEngine/SparseTensorRuntime.cpp   |  55 +--
 .../test/Dialect/SparseTensor/conversion.mlir |   2 +-
 .../SparseTensor/convert_dense2sparse.mlir    | 322 +++++++++---------
 .../Dialect/SparseTensor/sparse_expand.mlir   |   6 +-
 .../SparseTensor/sparse_fill_zero.mlir        |   2 +-
 9 files changed, 345 insertions(+), 348 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 2920ef79f461c6a..1333a2b3cd4bcdd 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -143,11 +143,10 @@ constexpr bool isComplexPrimaryType(PrimaryType valTy) {
 /// The actions performed by @newSparseTensor.
 enum class Action : uint32_t {
   kEmpty = 0,
-  // newSparseTensor no longer handles `kFromFile=1`, so we leave this
-  // number reserved to help catch any code that still needs updating.
+  kEmptyForward = 1,
   kFromCOO = 2,
   kSparseToSparse = 3,
-  kEmptyCOO = 4,
+  kFuture = 4, // not used
   kToCOO = 5,
   kToIterator = 6,
   kPack = 7,
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 303a41bc471d5d9..3bc854b02eeb1e8 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -33,7 +33,6 @@
     assert((isCompressedDLT(dlt) || isSingletonDLT(dlt)) &&                    \
            "Level is neither compressed nor singleton");                       \
   } while (false)
-#define ASSERT_DENSE_DLT(dlt) assert(isDenseDLT(dlt) && "Level is not dense");
 
 namespace mlir {
 namespace sparse_tensor {
@@ -44,6 +43,12 @@ class SparseTensorEnumeratorBase;
 template <typename P, typename C, typename V>
 class SparseTensorEnumerator;
 
+//===----------------------------------------------------------------------===//
+//
+//  SparseTensorStorage
+//
+//===----------------------------------------------------------------------===//
+
 /// Abstract base class for `SparseTensorStorage<P,C,V>`. This class
 /// takes responsibility for all the `<P,C,V>`-independent aspects
 /// of the tensor (e.g., shape, sparsity, mapping). In addition,
@@ -97,7 +102,7 @@ class SparseTensorStorageBase {
 
   /// Safely looks up the size of the given tensor-dimension.
   uint64_t getDimSize(uint64_t d) const {
-    assert(d < getDimRank() && "Dimension is out of bounds");
+    assert(d < getDimRank());
     return dimSizes[d];
   }
 
@@ -106,7 +111,7 @@ class SparseTensorStorageBase {
 
   /// Safely looks up the size of the given storage-level.
   uint64_t getLvlSize(uint64_t l) const {
-    assert(l < getLvlRank() && "Level is out of bounds");
+    assert(l < getLvlRank());
     return lvlSizes[l];
   }
 
@@ -115,7 +120,7 @@ class SparseTensorStorageBase {
 
   /// Safely looks up the type of the given level.
   DimLevelType getLvlType(uint64_t l) const {
-    assert(l < getLvlRank() && "Level is out of bounds");
+    assert(l < getLvlRank());
     return lvlTypes[l];
   }
 
@@ -173,6 +178,13 @@ class SparseTensorStorageBase {
   MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETVALUES)
 #undef DECL_GETVALUES
 
+  /// Element-wise forwarding insertions. The first argument is the
+  /// dimension-coordinates for the value being inserted.
+#define DECL_FORWARDINGINSERT(VNAME, V)                                        \
+  virtual void forwardingInsert(const uint64_t *, V);
+  MLIR_SPARSETENSOR_FOREVERY_V(DECL_FORWARDINGINSERT)
+#undef DECL_FORWARDINGINSERT
+
   /// Element-wise insertion in lexicographic coordinate order. The first
   /// argument is the level-coordinates for the value being inserted.
 #define DECL_LEXINSERT(VNAME, V) virtual void lexInsert(const uint64_t *, V);
@@ -182,24 +194,17 @@ class SparseTensorStorageBase {
   /// Expanded insertion.  Note that this method resets the
   /// values/filled-switch array back to all-zero/false while only
   /// iterating over the nonzero elements.
-  ///
-  /// Arguments:
-  /// * `lvlCoords` the level-coordinates shared by the values being inserted.
-  /// * `values` a map from last-level coordinates to their associated value.
-  /// * `filled` a map from last-level coordinates to bool, indicating
-  ///   whether `values` contains a valid value to be inserted.
-  /// * `added` a map from `[0..count)` to last-level coordinates for
-  ///   which `filled` is true and `values` contains the assotiated value.
-  /// * `count` the size of `added`.
-  /// * `expsz` the size of the expanded vector (verification only).
 #define DECL_EXPINSERT(VNAME, V)                                               \
   virtual void expInsert(uint64_t *, V *, bool *, uint64_t *, uint64_t,        \
                          uint64_t);
   MLIR_SPARSETENSOR_FOREVERY_V(DECL_EXPINSERT)
 #undef DECL_EXPINSERT
 
-  /// Finishes insertion.
-  virtual void endInsert() = 0;
+  /// Finalizes forwarding insertions.
+  virtual void endForwardingInsert() = 0;
+
+  /// Finalizes lexicographic insertions.
+  virtual void endLexInsert() = 0;
 
 private:
   const std::vector<uint64_t> dimSizes;
@@ -207,6 +212,8 @@ class SparseTensorStorageBase {
   const std::vector<DimLevelType> lvlTypes;
   const std::vector<uint64_t> dim2lvlVec;
   const std::vector<uint64_t> lvl2dimVec;
+
+protected:
   const MapRef map; // non-owning pointers into dim2lvl/lvl2dim vectors
 };
 
@@ -229,7 +236,8 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
                       const uint64_t *lvl2dim)
       : SparseTensorStorageBase(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                                 dim2lvl, lvl2dim),
-        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank) {}
+        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank), lvlCOO() {
+  }
 
 public:
   /// Constructs a sparse tensor with the given encoding, and allocates
@@ -242,11 +250,12 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
                       const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
-                      const uint64_t *lvl2dim, bool initializeValuesIfAllDense);
+                      const uint64_t *lvl2dim, SparseTensorCOO<V> *coo,
+                      bool initializeValuesIfAllDense);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
   /// the contents from the COO. This ctor performs the same heuristic
-  /// overhead-storage allocation as the ctor taking a `bool`.
+  /// overhead-storage allocation as the ctor above.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const DimLevelType *lvlTypes,
                       const uint64_t *dim2lvl, const uint64_t *lvl2dim,
@@ -279,10 +288,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   static SparseTensorStorage<P, C, V> *
   newEmpty(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
            const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-           const uint64_t *dim2lvl, const uint64_t *lvl2dim) {
-    return new SparseTensorStorage<P, C, V>(
-        dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, true);
-  }
+           const uint64_t *dim2lvl, const uint64_t *lvl2dim, bool forwarding);
 
   /// Allocates a new sparse tensor and initializes it from the given COO.
   /// The preconditions are as per the `SparseTensorStorageBase` ctor
@@ -303,19 +309,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
 
   /// Allocates a new sparse tensor and initializes it with the contents
   /// of another sparse tensor.
-  ///
-  /// Preconditions:
-  /// * as per the `SparseTensorStorageBase` ctor.
-  /// * `src2lvl` must be valid for `srcRank`, must map coordinates valid
-  ///    for `source.getDimSizes()` to coordinates valid for `lvlSizes`,
-  ///    and therefore must be the inverse of `lvl2dim`.
-  /// * `source` must have the same value type `V`.
-  ///
-  /// Asserts:
-  /// * `dimRank` and `lvlRank` are nonzero.
-  /// * `srcRank == source.getDimRank()`.
-  /// * `lvlSizes` contains only nonzero sizes.
-  /// * `source.getDimSizes()` is a refinement of `dimShape`.
   //
   // TODO: The `dimRank` and `dimShape` arguments are only used for
   // verifying that the source tensor has the expected shape.  So if we
@@ -337,10 +330,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
 
   /// Allocates a new sparse tensor and initialize it with the data stored level
   /// buffers directly.
-  ///
-  /// Precondition:
-  /// * as per the `SparseTensorStorageBase` ctor.
-  /// * the data integrity stored in `buffers` is guaranteed by users already.
   static SparseTensorStorage<P, C, V> *packFromLvlBuffers(
       uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
       const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
@@ -352,12 +341,12 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// Partially specialize these getter methods based on template types.
   void getPositions(std::vector<P> **out, uint64_t lvl) final {
     assert(out && "Received nullptr for out parameter");
-    assert(lvl < getLvlRank() && "Level is out of bounds");
+    assert(lvl < getLvlRank());
     *out = &positions[lvl];
   }
   void getCoordinates(std::vector<C> **out, uint64_t lvl) final {
     assert(out && "Received nullptr for out parameter");
-    assert(lvl < getLvlRank() && "Level is out of bounds");
+    assert(lvl < getLvlRank());
     *out = &coordinates[lvl];
   }
   void getValues(std::vector<V> **out) final {
@@ -365,15 +354,23 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     *out = &values;
   }
 
+  /// Returns coordinate at given position.
   uint64_t getCrd(uint64_t lvl, uint64_t pos) const final {
     ASSERT_COMPRESSED_OR_SINGLETON_LVL(lvl);
-    assert(pos < coordinates[lvl].size() && "Position is out of bounds");
+    assert(pos < coordinates[lvl].size());
     return coordinates[lvl][pos]; // Converts the stored `C` into `uint64_t`.
   }
 
+  /// Partially specialize forwarding insertions based on template types.
+  void forwardingInsert(const uint64_t *dimCoords, V val) final {
+    assert(dimCoords && lvlCOO);
+    map.pushforward(dimCoords, lvlCursor.data());
+    lvlCOO->add(lvlCursor, val);
+  }
+
   /// Partially specialize lexicographical insertions based on template types.
   void lexInsert(const uint64_t *lvlCoords, V val) final {
-    assert(lvlCoords && "Received nullptr for level-coordinates");
+    assert(lvlCoords);
     // First, wrap up pending insertion path.
     uint64_t diffLvl = 0;
     uint64_t full = 0;
@@ -416,8 +413,22 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     }
   }
 
+  /// Finalizes forwarding insertions.
+  void endForwardingInsert() final {
+    // Ensure lvlCOO is sorted.
+    assert(lvlCOO);
+    lvlCOO->sort();
+    // Now actually insert the `elements`.
+    const auto &elements = lvlCOO->getElements();
+    const uint64_t nse = elements.size();
+    assert(values.size() == 0);
+    values.reserve(nse);
+    fromCOO(elements, 0, nse, 0);
+    delete lvlCOO;
+  }
+
   /// Finalizes lexicographic insertions.
-  void endInsert() final {
+  void endLexInsert() final {
     if (values.empty())
       finalizeSegment(0);
     else
@@ -463,7 +474,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// does not check that `pos` is semantically valid (i.e., larger than
   /// the previous position and smaller than `coordinates[lvl].capacity()`).
   void appendPos(uint64_t lvl, uint64_t pos, uint64_t count = 1) {
-    assert(isCompressedLvl(lvl) && "Level is not compressed");
+    assert(isCompressedLvl(lvl));
     positions[lvl].insert(positions[lvl].end(), count,
                           detail::checkOverflowCast<P>(pos));
   }
@@ -482,7 +493,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     if (isCompressedDLT(dlt) || isSingletonDLT(dlt)) {
       coordinates[lvl].push_back(detail::checkOverflowCast<C>(crd));
     } else { // Dense level.
-      ASSERT_DENSE_DLT(dlt);
+      assert(isDenseDLT(dlt));
       assert(crd >= full && "Coordinate was already filled");
       if (crd == full)
         return; // Short-circuit, since it'll be a nop.
@@ -502,7 +513,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     // Subscript assignment to `std::vector` requires that the `pos`-th
     // entry has been initialized; thus we must be sure to check `size()`
     // here, instead of `capacity()` as would be ideal.
-    assert(pos < coordinates[lvl].size() && "Position is out of bounds");
+    assert(pos < coordinates[lvl].size());
     coordinates[lvl][pos] = detail::checkOverflowCast<C>(crd);
   }
 
@@ -574,7 +585,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     } else if (isSingletonDLT(dlt)) {
       return; // Nothing to finalize.
     } else {  // Dense dimension.
-      ASSERT_DENSE_DLT(dlt);
+      assert(isDenseDLT(dlt));
       const uint64_t sz = getLvlSizes()[l];
       assert(sz >= full && "Segment is overfull");
       count = detail::checkedMul(count, sz - full);
@@ -593,7 +604,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   void endPath(uint64_t diffLvl) {
     const uint64_t lvlRank = getLvlRank();
     const uint64_t lastLvl = lvlRank - 1;
-    assert(diffLvl <= lvlRank && "Level-diff is out of bounds");
+    assert(diffLvl <= lvlRank);
     const uint64_t stop = lvlRank - diffLvl;
     for (uint64_t i = 0; i < stop; ++i) {
       const uint64_t l = lastLvl - i;
@@ -606,7 +617,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   void insPath(const uint64_t *lvlCoords, uint64_t diffLvl, uint64_t full,
                V val) {
     const uint64_t lvlRank = getLvlRank();
-    assert(diffLvl <= lvlRank && "Level-diff is out of bounds");
+    assert(diffLvl <= lvlRank);
     for (uint64_t l = diffLvl; l < lvlRank; ++l) {
       const uint64_t c = lvlCoords[l];
       appendCrd(l, full, c);
@@ -646,11 +657,17 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   std::vector<std::vector<C>> coordinates;
   std::vector<V> values;
   std::vector<uint64_t> lvlCursor; // cursor for lexicographic insertion.
+  SparseTensorCOO<V> *lvlCOO;      // COO used during forwarding
 };
 
 #undef ASSERT_COMPRESSED_OR_SINGLETON_LVL
 
 //===----------------------------------------------------------------------===//
+//
+//  SparseTensorEnumerator
+//
+//===----------------------------------------------------------------------===//
+
 /// A (higher-order) function object for enumerating the elements of some
 /// `SparseTensorStorage` under a permutation.  That is, the `forallElements`
 /// method encapsulates the loop-nest for enumerating the elements of
@@ -738,7 +755,6 @@ class SparseTensorEnumeratorBase {
   std::vector<uint64_t> trgCursor; // in target order.
 };
 
-//===----------------------------------------------------------------------===//
 template <typename P, typename C, typename V>
 class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
   using Base = SparseTensorEnumeratorBase<V>;
@@ -778,8 +794,7 @@ class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
     // Recover the `<P,C,V>` type parameters of `src`.
     const auto &src = static_cast<const StorageImpl &>(this->src);
     if (l == src.getLvlRank()) {
-      assert(parentPos < src.values.size() &&
-             "Value position is out of bounds");
+      assert(parentPos < src.values.size());
       // TODO: <https://github.com/llvm/llvm-project/issues/54179>
       yield(this->trgCursor, src.values[parentPos]);
       return;
@@ -790,13 +805,12 @@ class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
       // Look up the bounds of the `l`-level segment determined by the
       // `(l - 1)`-level position `parentPos`.
       const std::vector<P> &positionsL = src.positions[l];
-      assert(parentPos + 1 < positionsL.size() &&
-             "Parent position is out of bounds");
+      assert(parentPos + 1 < positionsL.size());
       const uint64_t pstart = static_cast<uint64_t>(positionsL[parentPos]);
       const uint64_t pstop = static_cast<uint64_t>(positionsL[parentPos + 1]);
       // Loop-invariant code for looking up the `l`-level coordinates.
       const std::vector<C> &coordinatesL = src.coordinates[l];
-      assert(pstop <= coordinatesL.size() && "Stop position is out of bounds");
+      assert(pstop <= coordinatesL.size());
       for (uint64_t pos = pstart; pos < pstop; ++pos) {
         cursorL = static_cast<uint64_t>(coordinatesL[pos]);
         forallElements(yield, pos, l + 1);
@@ -805,7 +819,7 @@ class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
       cursorL = src.getCrd(l, parentPos);
       forallElements(yield, parentPos, l + 1);
     } else { // Dense level.
-      ASSERT_DENSE_DLT(dlt);
+      assert(isDenseDLT(dlt));
       const uint64_t sz = src.getLvlSizes()[l];
       const uint64_t pstart = parentPos * sz;
       for (uint64_t c = 0; c < sz; ++c) {
@@ -817,6 +831,11 @@ class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
 };
 
 //===----------------------------------------------------------------------===//
+//
+//  SparseTensorNNZ
+//
+//===----------------------------------------------------------------------===//
+
 /// Statistics regarding the number of nonzero subtensors in
 /// a source tensor, for direct sparse=>sparse conversion a la
 /// <https://arxiv.org/abs/2001.02609>.
@@ -889,7 +908,23 @@ class SparseTensorNNZ final {
 };
 
 //===----------------------------------------------------------------------===//
-// Definitions of the ctors and factories of `SparseTensorStorage<P,C,V>`.
+//
+//  SparseTensorStorage Factories
+//
+//===----------------------------------------------------------------------===//
+
+template <typename P, typename C, typename V>
+SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newEmpty(
+    uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
+    const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
+    const uint64_t *dim2lvl, const uint64_t *lvl2dim, bool forwarding) {
+  SparseTensorCOO<V> *lvlCOO = nullptr;
+  if (forwarding)
+    lvlCOO = new SparseTensorCOO<V>(lvlRank, lvlSizes);
+  return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank, lvlSizes,
+                                          lvlTypes, dim2lvl, lvl2dim, lvlCOO,
+                                          !forwarding);
+}
 
 // TODO: MapRef
 template <typename P, typename C, typename V>
@@ -897,8 +932,7 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromCOO(
     uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
     const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
     const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO) {
-  assert(dimShape && "Got nullptr for dimension shape");
-  assert(lvl2dim && "Got nullptr for level-to-dimension mapping");
+  assert(dimShape && dim2lvl && lvl2dim);
   const auto &lvlSizes = lvlCOO.getDimSizes();
   assert(lvlRank == lvlSizes.size() && "Level-rank mismatch");
   // Must reconstruct `dimSizes` from `lvlSizes`.  While this is easy
@@ -956,14 +990,21 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::packFromLvlBuffers(
   return tensor;
 }
 
+//===----------------------------------------------------------------------===//
+//
+//  SparseTensorStorage Constructors
+//
+//===----------------------------------------------------------------------===//
+
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *dim2lvl, const uint64_t *lvl2dim,
+    const uint64_t *dim2lvl, const uint64_t *lvl2dim, SparseTensorCOO<V> *coo,
     bool initializeValuesIfAllDense)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                           dim2lvl, lvl2dim) {
+  lvlCOO = coo;
   // Provide hints on capacity of positions and coordinates.
   // TODO: needs much fine-tuning based on actual sparsity; currently
   // we reserve position/coordinate space based on all previous dense
@@ -984,7 +1025,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       sz = 1;
       allDense = false;
     } else { // Dense level.
-      ASSERT_DENSE_DLT(dlt);
+      assert(isDenseDLT(dlt));
       sz = detail::checkedMul(sz, lvlSizes[l]);
     }
   }
@@ -992,6 +1033,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     values.resize(sz, 0);
 }
 
+// TODO: share more code with forwarding methods?
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
@@ -999,14 +1041,14 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
     const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank,
                           lvlCOO.getDimSizes().data(), lvlTypes, dim2lvl,
-                          lvl2dim, false) {
+                          lvl2dim, nullptr, false) {
+  // Ensure lvlCOO is sorted.
   assert(lvlRank == lvlCOO.getDimSizes().size() && "Level-rank mismatch");
-  // Ensure the preconditions of `fromCOO`.  (One is already ensured by
-  // using `lvlSizes = lvlCOO.getDimSizes()` in the ctor above.)
   lvlCOO.sort();
   // Now actually insert the `elements`.
   const auto &elements = lvlCOO.getElements();
   const uint64_t nse = elements.size();
+  assert(values.size() == 0);
   values.reserve(nse);
   fromCOO(elements, 0, nse, 0);
 }
@@ -1053,7 +1095,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       if (isCompressedDLT(dlt) || isSingletonDLT(dlt))
         coordinates[l].resize(parentSz, 0);
       else
-        ASSERT_DENSE_DLT(dlt); // Future-proofing.
+        assert(isDenseDLT(dlt));
     }
     values.resize(parentSz, 0); // Both allocate and zero-initialize.
   }
@@ -1067,7 +1109,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
         // however, it's semantically invalid here since that entry
         // does not represent a segment of `coordinates[l]`.  Moreover, that
         // entry must be immutable for `assembledSize` to remain valid.
-        assert(parentPos < parentSz && "Parent position is out of bounds");
+        assert(parentPos < parentSz);
         const uint64_t currentPos = positions[l][parentPos];
         // This increment won't overflow the `P` type, since it can't
         // exceed the original value of `positions[l][parentPos+1]`
@@ -1080,12 +1122,12 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
         writeCrd(l, parentPos, lvlCoords[l]);
         // the new parentPos equals the old parentPos.
       } else { // Dense level.
-        ASSERT_DENSE_DLT(dlt);
+        assert(isDenseDLT(dlt));
         parentPos = parentPos * getLvlSizes()[l] + lvlCoords[l];
       }
       parentSz = assembledSize(parentSz, l);
     }
-    assert(parentPos < values.size() && "Value position is out of bounds");
+    assert(parentPos < values.size());
     values[parentPos] = val;
   });
   // The finalizeYieldPos loop
@@ -1105,8 +1147,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     } else {
       // Both dense and singleton are no-ops for the finalizeYieldPos loop.
       // This assertion is for future-proofing.
-      assert((isDenseDLT(dlt) || isSingletonDLT(dlt)) &&
-             "Level is neither dense nor singleton");
+      assert((isDenseDLT(dlt) || isSingletonDLT(dlt)));
     }
     parentSz = assembledSize(parentSz, l);
   }
@@ -1140,7 +1181,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       positions[l].assign(posPtr, posPtr + parentSz + 1);
       coordinates[l].assign(crdPtr, crdPtr + positions[l][parentSz]);
     } else {
-      assert(isDenseLvl(l) && "Level is not dense");
+      assert(isDenseLvl(l));
     }
     parentSz = assembledSize(parentSz, l);
   }
@@ -1165,8 +1206,6 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
   values.assign(valPtr, valPtr + parentSz);
 }
 
-#undef ASSERT_DENSE_DLT
-
 } // namespace sparse_tensor
 } // namespace mlir
 
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
index e723a354345849d..776f74cdc804ea0 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
@@ -37,7 +37,6 @@ extern "C" {
 //
 //===----------------------------------------------------------------------===//
 
-/// The @newSparseTensor function for constructing a new sparse tensor.
 /// This is the "swiss army knife" method for materializing sparse
 /// tensors into the computation.  The types of the `ptr` argument and
 /// the result depend on the action, as explained in the following table
@@ -45,14 +44,13 @@ extern "C" {
 /// a coordinate-scheme object, and "Iterator" means an iterator object).
 ///
 /// Action:         `ptr`:          Returns:
-/// kEmpty          unused          STS, empty
-/// kEmptyCOO       unused          COO, empty
-/// kFromFile       char* filename  STS, read from the file
+/// kEmpty          -               STS, empty
+/// kEmptyForward   -               STS, empty, with forwarding COO
 /// kFromCOO        COO             STS, copied from the COO source
-/// kToCOO          STS             COO, copied from the STS source
 /// kSparseToSparse STS             STS, copied from the STS source
-/// kToIterator     STS             Iterator, call @getNext to use and
-///                                 @delSparseTensorIterator to free.
+/// kToCOO          STS             COO, copied from the STS source
+/// kToIterator     STS             Iterator (@getNext/@delSparseTensorIterator)
+/// kPack           buffers         STS, from level buffers
 MLIR_CRUNNERUTILS_EXPORT void *_mlir_ciface_newSparseTensor( // NOLINT
     StridedMemRefType<index_type, 1> *dimSizesRef,
     StridedMemRefType<index_type, 1> *lvlSizesRef,
@@ -84,19 +82,15 @@ MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSEPOSITIONS)
 MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSECOORDINATES)
 #undef DECL_SPARSECOORDINATES
 
-/// Coordinate-scheme method for adding a new element.
-/// TODO: remove dim2lvl
-#define DECL_ADDELT(VNAME, V)                                                  \
-  MLIR_CRUNNERUTILS_EXPORT void *_mlir_ciface_addElt##VNAME(                   \
-      void *lvlCOO, StridedMemRefType<V, 0> *vref,                             \
-      StridedMemRefType<index_type, 1> *dimCoordsRef,                          \
-      StridedMemRefType<index_type, 1> *dim2lvlRef);
-MLIR_SPARSETENSOR_FOREVERY_V(DECL_ADDELT)
-#undef DECL_ADDELT
+/// Tensor-storage method for a dim to lvl forwarding insertion.
+#define DECL_FORWARDINGINSERT(VNAME, V)                                        \
+  MLIR_CRUNNERUTILS_EXPORT void _mlir_ciface_forwardingInsert##VNAME(          \
+      void *tensor, StridedMemRefType<V, 0> *vref,                             \
+      StridedMemRefType<index_type, 1> *dimCoordsRef);
+MLIR_SPARSETENSOR_FOREVERY_V(DECL_FORWARDINGINSERT)
+#undef DECL_FORWARDINGINSERT
 
 /// Coordinate-scheme method for getting the next element while iterating.
-/// The `cref` argument uses the same coordinate-space as the `iter` (which
-/// can be either dim- or lvl-coords, depending on context).
 #define DECL_GETNEXT(VNAME, V)                                                 \
   MLIR_CRUNNERUTILS_EXPORT bool _mlir_ciface_getNext##VNAME(                   \
       void *iter, StridedMemRefType<index_type, 1> *cref,                      \
@@ -185,8 +179,11 @@ MLIR_CRUNNERUTILS_EXPORT index_type sparseLvlSize(void *tensor, index_type l);
 /// Tensor-storage method to get the size of the given dimension.
 MLIR_CRUNNERUTILS_EXPORT index_type sparseDimSize(void *tensor, index_type d);
 
+/// Tensor-storage method to finalize forwarding insertions.
+MLIR_CRUNNERUTILS_EXPORT void endForwardingInsert(void *tensor);
+
 /// Tensor-storage method to finalize lexicographic insertions.
-MLIR_CRUNNERUTILS_EXPORT void endInsert(void *tensor);
+MLIR_CRUNNERUTILS_EXPORT void endLexInsert(void *tensor);
 
 /// Coordinate-scheme method to write to file in extended FROSTT format.
 #define DECL_OUTSPARSETENSOR(VNAME, V)                                         \
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index d2d7b46ab834e71..5c3a59aa3caf36a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -241,16 +241,6 @@ class NewCallParams final {
     return true;
   }
 
-  /// Gets the dimension-to-level mapping.
-  //
-  // TODO: This is only ever used for passing into `genAddEltCall`;
-  // is there a better way to encapsulate that pattern (both to avoid
-  // this one-off getter, and to avoid potential mixups)?
-  Value getDimToLvl() const {
-    assert(isInitialized() && "Must initialize before getDimToLvl");
-    return params[kParamDim2Lvl];
-  }
-
   /// Generates a function call, with the current static parameters
   /// and the given dynamic arguments.
   Value genNewCall(Action action, Value ptr = Value()) {
@@ -307,16 +297,11 @@ static void genDelIteratorCall(OpBuilder &builder, Location loc, Type elemTp,
   createFuncCall(builder, loc, name, {}, iter, EmitCInterface::Off);
 }
 
-/// Generates a call that adds one element to a coordinate scheme.
-/// In particular, this generates code like the following:
-///   val = a[i1,..,ik];
-///   if val != 0
-///     t->add(&val, [i1,..,ik], [p1,..,pk]);
-static void genAddEltCall(OpBuilder &builder, Location loc, Type eltType,
-                          Value lvlCOO, Value valPtr, Value dimCoords,
-                          Value dimToLvl) {
-  SmallString<9> name{"addElt", primaryTypeFunctionSuffix(eltType)};
-  SmallVector<Value, 4> params{lvlCOO, valPtr, dimCoords, dimToLvl};
+/// Generates a call with a forwarding insertion.
+static void genForwardingInsert(OpBuilder &builder, Location loc, Type eltType,
+                                Value tensor, Value valPtr, Value dimCoords) {
+  SmallString<9> name{"forwardingInsert", primaryTypeFunctionSuffix(eltType)};
+  SmallVector<Value, 4> params{tensor, valPtr, dimCoords};
   Type pTp = getOpaquePointerType(builder);
   createFuncCall(builder, loc, name, pTp, params, EmitCInterface::On);
 }
@@ -659,40 +644,26 @@ class SparseTensorConvertConverter : public OpConversionPattern<ConvertOp> {
       return success();
     }
     assert(!srcTp.hasEncoding() && dstTp.hasEncoding());
-    // This is a dense => sparse conversion or a sparse constant in COO =>
-    // sparse conversion, which is handled as follows:
-    //   t = newSparseCOO()
-    //   ...code to fill the COO tensor t...
-    //   s = newSparseTensor(t)
-    //
-    // To fill the COO tensor from a dense tensor:
-    //   for i1 in dim1
-    //    ..
-    //     for ik in dimk
-    //       val = a[i1,..,ik]
-    //       if val != 0
-    //         t->add(val, [i1,..,ik], [p1,..,pk])
+    // This is a "dense => sparse conversion" or a "sparse constant => sparse
+    // conversion" which is conceptually handled as follows, with an additional
+    // test for nonzero values for the dense case.
     //
-    // To fill the COO tensor from a sparse constant in COO format:
-    //   for i in range(NNZ)
-    //     val = values[i]
-    //     [i1,..,ik] = coordinates[i]
-    //     t->add(val, [i1,..,ik], [p1,..,pk])
+    //   st = newSparseTensor()                  ; ST with forwarding COO
+    //   for i1 in dim1, ..., ik in dimk         ; loop nest or range(NNZ) loop
+    //     val = a[i1,..,ik]                     ;
+    //     st->insertForwarding(val, [i1,..,ik]) ; maps dim to level
+    //   st->endForwardingInsert()               ; finalize forwarding
     //
-    // Note that the dense tensor traversal code is actually implemented
-    // using MLIR IR to avoid having to expose too much low-level
-    // memref traversal details to the runtime support library.
-    // Also note that the code below only generates the "new" ops and
-    // the loop-nest per se; whereas the entire body of the innermost
-    // loop is generated by genAddElt().
+    // Note that the traversal code is actually implemented using MLIR IR to
+    // avoid having to expose too much low-level memref traversal details to
+    // the runtime support library.
     SmallVector<Value> dimSizes;
     sizesFromSrc(rewriter, dimSizes, loc, src);
     NewCallParams params(rewriter, loc);
-    Value coo =
-        params.genBuffers(dstTp, dimSizes).genNewCall(Action::kEmptyCOO);
+    Value tensor =
+        params.genBuffers(dstTp, dimSizes).genNewCall(Action::kEmptyForward);
     const Type iTp = rewriter.getIndexType();
     Value dimCoords = genAlloca(rewriter, loc, dimRank, iTp);
-    Value dimToLvl = params.getDimToLvl();
     Value elemPtr = genAllocaScalar(rewriter, loc, elemTp);
     genDenseTensorOrSparseConstantIterLoop(
         rewriter, loc, src, dimRank,
@@ -700,13 +671,13 @@ class SparseTensorConvertConverter : public OpConversionPattern<ConvertOp> {
           assert(dcvs.size() == static_cast<size_t>(dimRank));
           storeAll(builder, loc, dimCoords, dcvs);
           builder.create<memref::StoreOp>(loc, val, elemPtr);
-          genAddEltCall(builder, loc, elemTp, coo, elemPtr, dimCoords,
-                        dimToLvl);
+          genForwardingInsert(builder, loc, elemTp, tensor, elemPtr, dimCoords);
         });
     // Final call to construct sparse tensor storage.
-    Value dst = params.genNewCall(Action::kFromCOO, coo);
-    genDelCOOCall(rewriter, loc, elemTp, coo);
-    rewriter.replaceOp(op, dst);
+    StringRef name = "endForwardingInsert";
+    createFuncCall(rewriter, op->getLoc(), name, {}, tensor,
+                   EmitCInterface::Off);
+    rewriter.replaceOp(op, tensor);
     return success();
   }
 
@@ -828,7 +799,7 @@ class SparseTensorLoadConverter : public OpConversionPattern<LoadOp> {
                   ConversionPatternRewriter &rewriter) const override {
     if (op.getHasInserts()) {
       // Finalize any pending insertions.
-      StringRef name = "endInsert";
+      StringRef name = "endLexInsert";
       createFuncCall(rewriter, op->getLoc(), name, {}, adaptor.getOperands(),
                      EmitCInterface::Off);
     }
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
index bc6d4ad2c740189..614f712e2248eee 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -177,9 +177,16 @@ extern "C" {
 #define CASE(p, c, v, P, C, V)                                                 \
   if (posTp == (p) && crdTp == (c) && valTp == (v)) {                          \
     switch (action) {                                                          \
-    case Action::kEmpty:                                                       \
+    case Action::kEmpty: {                                                     \
       return SparseTensorStorage<P, C, V>::newEmpty(                           \
-          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim);   \
+          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
+          false);                                                              \
+    }                                                                          \
+    case Action::kEmptyForward: {                                              \
+      return SparseTensorStorage<P, C, V>::newEmpty(                           \
+          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
+          true);                                                               \
+    }                                                                          \
     case Action::kFromCOO: {                                                   \
       assert(ptr && "Received nullptr for SparseTensorCOO object");            \
       auto &coo = *static_cast<SparseTensorCOO<V> *>(ptr);                     \
@@ -193,8 +200,9 @@ extern "C" {
           dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
           dimRank, tensor);                                                    \
     }                                                                          \
-    case Action::kEmptyCOO:                                                    \
-      return new SparseTensorCOO<V>(lvlRank, lvlSizes);                        \
+    case Action::kFuture: {                                                    \
+      break;                                                                   \
+    }                                                                          \
     case Action::kToCOO: {                                                     \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       auto &tensor = *static_cast<SparseTensorStorage<P, C, V> *>(ptr);        \
@@ -399,29 +407,20 @@ MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSECOORDINATES)
 #undef IMPL_SPARSECOORDINATES
 #undef IMPL_GETOVERHEAD
 
-// TODO: use MapRef here for translation of coordinates
-// TODO: remove dim2lvl
-#define IMPL_ADDELT(VNAME, V)                                                  \
-  void *_mlir_ciface_addElt##VNAME(                                            \
-      void *lvlCOO, StridedMemRefType<V, 0> *vref,                             \
-      StridedMemRefType<index_type, 1> *dimCoordsRef,                          \
-      StridedMemRefType<index_type, 1> *dim2lvlRef) {                          \
-    assert(lvlCOO &&vref);                                                     \
+#define IMPL_FORWARDINGINSERT(VNAME, V)                                        \
+  void _mlir_ciface_forwardingInsert##VNAME(                                   \
+      void *t, StridedMemRefType<V, 0> *vref,                                  \
+      StridedMemRefType<index_type, 1> *dimCoordsRef) {                        \
+    assert(t &&vref);                                                          \
+    auto &tensor = *static_cast<SparseTensorStorageBase *>(t);                 \
     ASSERT_NO_STRIDE(dimCoordsRef);                                            \
-    ASSERT_NO_STRIDE(dim2lvlRef);                                              \
-    const uint64_t rank = MEMREF_GET_USIZE(dimCoordsRef);                      \
-    ASSERT_USIZE_EQ(dim2lvlRef, rank);                                         \
     const index_type *dimCoords = MEMREF_GET_PAYLOAD(dimCoordsRef);            \
-    const index_type *dim2lvl = MEMREF_GET_PAYLOAD(dim2lvlRef);                \
-    std::vector<index_type> lvlCoords(rank);                                   \
-    for (uint64_t d = 0; d < rank; ++d)                                        \
-      lvlCoords[dim2lvl[d]] = dimCoords[d];                                    \
-    V *value = MEMREF_GET_PAYLOAD(vref);                                       \
-    static_cast<SparseTensorCOO<V> *>(lvlCOO)->add(lvlCoords, *value);         \
-    return lvlCOO;                                                             \
+    assert(dimCoords);                                                         \
+    const V *value = MEMREF_GET_PAYLOAD(vref);                                 \
+    tensor.forwardingInsert(dimCoords, *value);                                \
   }
-MLIR_SPARSETENSOR_FOREVERY_V(IMPL_ADDELT)
-#undef IMPL_ADDELT
+MLIR_SPARSETENSOR_FOREVERY_V(IMPL_FORWARDINGINSERT)
+#undef IMPL_FORWARDINGINSERT
 
 // NOTE: the `cref` argument uses the same coordinate-space as the `iter`
 // (which can be either dim- or lvl-coords, depending on context).
@@ -686,8 +685,12 @@ index_type sparseDimSize(void *tensor, index_type d) {
   return static_cast<SparseTensorStorageBase *>(tensor)->getDimSize(d);
 }
 
-void endInsert(void *tensor) {
-  return static_cast<SparseTensorStorageBase *>(tensor)->endInsert();
+void endForwardingInsert(void *tensor) {
+  return static_cast<SparseTensorStorageBase *>(tensor)->endForwardingInsert();
+}
+
+void endLexInsert(void *tensor) {
+  return static_cast<SparseTensorStorageBase *>(tensor)->endLexInsert();
 }
 
 #define IMPL_OUTSPARSETENSOR(VNAME, V)                                         \
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
index 29093a055ab2e04..96300a98a6a4bc5 100644
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -296,7 +296,7 @@ func.func @sparse_reconstruct(%arg0: tensor<128xf32, #SparseVector>) -> tensor<1
 
 // CHECK-LABEL: func @sparse_reconstruct_ins(
 //  CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>
-//       CHECK: call @endInsert(%[[A]]) : (!llvm.ptr<i8>) -> ()
+//       CHECK: call @endLexInsert(%[[A]]) : (!llvm.ptr<i8>) -> ()
 //       CHECK: return %[[A]] : !llvm.ptr<i8>
 func.func @sparse_reconstruct_ins(%arg0: tensor<128xf32, #SparseVector>) -> tensor<128xf32, #SparseVector> {
   %0 = sparse_tensor.load %arg0 hasInserts : tensor<128xf32, #SparseVector>
diff --git a/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir
index 1a69c80f7ecadfd..d7f7ef2bfbb96d8 100644
--- a/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir
+++ b/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir
@@ -18,83 +18,79 @@
 
 // CHECK-LABEL:   func.func @sparse_convert_1d(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<?xi32>) -> !llvm.ptr<i8> {
-// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 2 : i32
-// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 4 : i32
-// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 6 : i32
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 6 : i32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 8 : i8
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?xi32>
+// CHECK:           %[[VAL_8:.*]] = memref.alloca() : memref<1xi8>
+// CHECK:           %[[VAL_9:.*]] = memref.cast %[[VAL_8]] : memref<1xi8> to memref<?xi8>
+// CHECK:           memref.store %[[VAL_5]], %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<1xi8>
+// CHECK:           %[[VAL_10:.*]] = memref.alloca() : memref<1xindex>
+// CHECK:           %[[VAL_11:.*]] = memref.cast %[[VAL_10]] : memref<1xindex> to memref<?xindex>
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<1xindex>
+// CHECK:           %[[VAL_12:.*]] = memref.alloca() : memref<1xindex>
+// CHECK:           %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<1xindex> to memref<?xindex>
+// CHECK:           memref.store %[[VAL_6]], %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref<1xindex>
+// CHECK:           %[[VAL_14:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
+// CHECK:           %[[VAL_15:.*]] = call @newSparseTensor(%[[VAL_11]], %[[VAL_11]], %[[VAL_9]], %[[VAL_13]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]], %[[VAL_2]], %[[VAL_1]], %[[VAL_14]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:           %[[VAL_16:.*]] = memref.alloca() : memref<1xindex>
+// CHECK:           %[[VAL_17:.*]] = memref.cast %[[VAL_16]] : memref<1xindex> to memref<?xindex>
+// CHECK:           %[[VAL_18:.*]] = memref.alloca() : memref<i32>
+// CHECK:           scf.for %[[VAL_19:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_4]] {
+// CHECK:             %[[VAL_20:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_19]]] : tensor<?xi32>
+// CHECK:             %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_20]], %[[VAL_3]] : i32
+// CHECK:             scf.if %[[VAL_21]] {
+// CHECK:               memref.store %[[VAL_19]], %[[VAL_16]]{{\[}}%[[VAL_6]]] : memref<1xindex>
+// CHECK:               memref.store %[[VAL_20]], %[[VAL_18]][] : memref<i32>
+// CHECK:               %[[VAL_22:.*]] = func.call @forwardingInsertI32(%[[VAL_15]], %[[VAL_18]], %[[VAL_17]]) : (!llvm.ptr<i8>, memref<i32>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           call @endForwardingInsert(%[[VAL_15]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_15]] : !llvm.ptr<i8>
+// CHECK:         }
+func.func @sparse_convert_1d(%arg0: tensor<?xi32>) -> tensor<?xi32, #SparseVector> {
+  %0 = sparse_tensor.convert %arg0 : tensor<?xi32> to tensor<?xi32, #SparseVector>
+  return %0 : tensor<?xi32, #SparseVector>
+}
+
+// CHECK-LABEL:   func.func @sparse_convert_complex(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<100xcomplex<f64>>) -> !llvm.ptr<i8> {
+// CHECK-DAG:       %[[VAL_1:.*]] = complex.constant [0.000000e+00, 0.000000e+00] : complex<f64>
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 9 : i32
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : i32
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 8 : i8
-// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_7]] : tensor<?xi32>
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 100 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 8 : i8
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_9:.*]] = memref.alloca() : memref<1xi8>
 // CHECK:           %[[VAL_10:.*]] = memref.cast %[[VAL_9]] : memref<1xi8> to memref<?xi8>
-// CHECK:           memref.store %[[VAL_6]], %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<1xi8>
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<1xi8>
 // CHECK:           %[[VAL_11:.*]] = memref.alloca() : memref<1xindex>
 // CHECK:           %[[VAL_12:.*]] = memref.cast %[[VAL_11]] : memref<1xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_8]], %[[VAL_11]]{{\[}}%[[VAL_7]]] : memref<1xindex>
+// CHECK:           memref.store %[[VAL_6]], %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref<1xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.alloca() : memref<1xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.cast %[[VAL_13]] : memref<1xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_7]], %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref<1xindex>
+// CHECK:           memref.store %[[VAL_5]], %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref<1xindex>
 // CHECK:           %[[VAL_15:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
 // CHECK:           %[[VAL_16:.*]] = call @newSparseTensor(%[[VAL_12]], %[[VAL_12]], %[[VAL_10]], %[[VAL_14]], %[[VAL_14]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]], %[[VAL_15]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
 // CHECK:           %[[VAL_17:.*]] = memref.alloca() : memref<1xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.cast %[[VAL_17]] : memref<1xindex> to memref<?xindex>
-// CHECK:           %[[VAL_19:.*]] = memref.alloca() : memref<i32>
-// CHECK:           scf.for %[[VAL_20:.*]] = %[[VAL_7]] to %[[VAL_8]] step %[[VAL_5]] {
-// CHECK:             %[[VAL_21:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_20]]] : tensor<?xi32>
-// CHECK:             %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_4]] : i32
+// CHECK:           %[[VAL_19:.*]] = memref.alloca() : memref<complex<f64>>
+// CHECK:           scf.for %[[VAL_20:.*]] = %[[VAL_5]] to %[[VAL_6]] step %[[VAL_8]] {
+// CHECK:             %[[VAL_21:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_20]]] : tensor<100xcomplex<f64>>
+// CHECK:             %[[VAL_22:.*]] = complex.neq %[[VAL_21]], %[[VAL_1]] : complex<f64>
 // CHECK:             scf.if %[[VAL_22]] {
-// CHECK:               memref.store %[[VAL_20]], %[[VAL_17]]{{\[}}%[[VAL_7]]] : memref<1xindex>
-// CHECK:               memref.store %[[VAL_21]], %[[VAL_19]][] : memref<i32>
-// CHECK:               %[[VAL_23:.*]] = func.call @addEltI32(%[[VAL_16]], %[[VAL_19]], %[[VAL_18]], %[[VAL_14]]) : (!llvm.ptr<i8>, memref<i32>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
-// CHECK:             }
-// CHECK:           }
-// CHECK:           %[[VAL_24:.*]] = call @newSparseTensor(%[[VAL_12]], %[[VAL_12]], %[[VAL_10]], %[[VAL_14]], %[[VAL_14]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_1]], %[[VAL_16]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOI32(%[[VAL_16]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_24]] : !llvm.ptr<i8>
-// CHECK:         }
-func.func @sparse_convert_1d(%arg0: tensor<?xi32>) -> tensor<?xi32, #SparseVector> {
-  %0 = sparse_tensor.convert %arg0 : tensor<?xi32> to tensor<?xi32, #SparseVector>
-  return %0 : tensor<?xi32, #SparseVector>
-}
-
-// CHECK-LABEL:   func.func @sparse_convert_complex(
-// CHECK-SAME:      %[[VAL_0:.*]]: tensor<100xcomplex<f64>>) -> !llvm.ptr<i8> {
-// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 2 : i32
-// CHECK-DAG:       %[[VAL_2:.*]] = complex.constant [0.000000e+00, 0.000000e+00] : complex<f64>
-// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 4 : i32
-// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 9 : i32
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : i32
-// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 100 : index
-// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_10:.*]] = memref.alloca() : memref<1xi8>
-// CHECK:           %[[VAL_11:.*]] = memref.cast %[[VAL_10]] : memref<1xi8> to memref<?xi8>
-// CHECK:           memref.store %[[VAL_8]], %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<1xi8>
-// CHECK:           %[[VAL_12:.*]] = memref.alloca() : memref<1xindex>
-// CHECK:           %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<1xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_7]], %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref<1xindex>
-// CHECK:           %[[VAL_14:.*]] = memref.alloca() : memref<1xindex>
-// CHECK:           %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<1xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_6]], %[[VAL_14]]{{\[}}%[[VAL_6]]] : memref<1xindex>
-// CHECK:           %[[VAL_16:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:           %[[VAL_17:.*]] = call @newSparseTensor(%[[VAL_13]], %[[VAL_13]], %[[VAL_11]], %[[VAL_15]], %[[VAL_15]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_3]], %[[VAL_16]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           %[[VAL_18:.*]] = memref.alloca() : memref<1xindex>
-// CHECK:           %[[VAL_19:.*]] = memref.cast %[[VAL_18]] : memref<1xindex> to memref<?xindex>
-// CHECK:           %[[VAL_20:.*]] = memref.alloca() : memref<complex<f64>>
-// CHECK:           scf.for %[[VAL_21:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_9]] {
-// CHECK:             %[[VAL_22:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_21]]] : tensor<100xcomplex<f64>>
-// CHECK:             %[[VAL_23:.*]] = complex.neq %[[VAL_22]], %[[VAL_2]] : complex<f64>
-// CHECK:             scf.if %[[VAL_23]] {
-// CHECK:               memref.store %[[VAL_21]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref<1xindex>
-// CHECK:               memref.store %[[VAL_22]], %[[VAL_20]][] : memref<complex<f64>>
-// CHECK:               %[[VAL_24:.*]] = func.call @addEltC64(%[[VAL_17]], %[[VAL_20]], %[[VAL_19]], %[[VAL_15]]) : (!llvm.ptr<i8>, memref<complex<f64>>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:               memref.store %[[VAL_20]], %[[VAL_17]]{{\[}}%[[VAL_5]]] : memref<1xindex>
+// CHECK:               memref.store %[[VAL_21]], %[[VAL_19]][] : memref<complex<f64>>
+// CHECK:               %[[VAL_23:.*]] = func.call @forwardingInsertC64(%[[VAL_16]], %[[VAL_19]], %[[VAL_18]]) : (!llvm.ptr<i8>, memref<complex<f64>>, memref<?xindex>) -> !llvm.ptr<i8>
 // CHECK:             }
 // CHECK:           }
-// CHECK:           %[[VAL_25:.*]] = call @newSparseTensor(%[[VAL_13]], %[[VAL_13]], %[[VAL_11]], %[[VAL_15]], %[[VAL_15]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_1]], %[[VAL_17]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOC64(%[[VAL_17]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_25]] : !llvm.ptr<i8>
+// CHECK:           call @endForwardingInsert(%[[VAL_16]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_16]] : !llvm.ptr<i8>
 // CHECK:         }
 func.func @sparse_convert_complex(%arg0: tensor<100xcomplex<f64>>) -> tensor<100xcomplex<f64>, #SparseVector> {
   %0 = sparse_tensor.convert %arg0 : tensor<100xcomplex<f64>> to tensor<100xcomplex<f64>, #SparseVector>
@@ -103,49 +99,46 @@ func.func @sparse_convert_complex(%arg0: tensor<100xcomplex<f64>>) -> tensor<100
 
 // CHECK-LABEL:   func.func @sparse_convert_2d(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<2x4xf64>) -> !llvm.ptr<i8> {
-// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 2 : i32
-// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 4 : i32
-// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : i32
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : i32
-// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 2 : index
-// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 4 : index
-// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 4 : i8
-// CHECK-DAG:       %[[VAL_11:.*]] = arith.constant 8 : i8
-// CHECK:           %[[VAL_12:.*]] = memref.alloca() : memref<2xi8>
-// CHECK:           %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi8> to memref<?xi8>
-// CHECK:           memref.store %[[VAL_10]], %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref<2xi8>
-// CHECK:           memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_8]]] : memref<2xi8>
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 4 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 4 : i8
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 8 : i8
+// CHECK:           %[[VAL_10:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:           %[[VAL_11:.*]] = memref.cast %[[VAL_10]] : memref<2xi8> to memref<?xi8>
+// CHECK:           memref.store %[[VAL_8]], %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<2xi8>
+// CHECK:           memref.store %[[VAL_9]], %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<2xi8>
+// CHECK:           %[[VAL_12:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:           %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xindex> to memref<?xindex>
+// CHECK:           memref.store %[[VAL_5]], %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<2xindex>
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref<2xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.alloca() : memref<2xindex>
 // CHECK:           %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<2xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_7]], %[[VAL_14]]{{\[}}%[[VAL_6]]] : memref<2xindex>
-// CHECK:           memref.store %[[VAL_9]], %[[VAL_14]]{{\[}}%[[VAL_8]]] : memref<2xindex>
-// CHECK:           %[[VAL_16:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:           %[[VAL_17:.*]] = memref.cast %[[VAL_16]] : memref<2xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_6]], %[[VAL_16]]{{\[}}%[[VAL_6]]] : memref<2xindex>
-// CHECK:           memref.store %[[VAL_8]], %[[VAL_16]]{{\[}}%[[VAL_8]]] : memref<2xindex>
-// CHECK:           %[[VAL_18:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:           %[[VAL_19:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_15]], %[[VAL_13]], %[[VAL_17]], %[[VAL_17]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_3]], %[[VAL_18]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           %[[VAL_20:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:           %[[VAL_21:.*]] = memref.cast %[[VAL_20]] : memref<2xindex> to memref<?xindex>
-// CHECK:           %[[VAL_22:.*]] = memref.alloca() : memref<f64>
-// CHECK:           scf.for %[[VAL_23:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] {
-// CHECK:             scf.for %[[VAL_24:.*]] = %[[VAL_6]] to %[[VAL_9]] step %[[VAL_8]] {
-// CHECK:               %[[VAL_25:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_23]], %[[VAL_24]]] : tensor<2x4xf64>
-// CHECK:               %[[VAL_26:.*]] = arith.cmpf une, %[[VAL_25]], %[[VAL_2]] : f64
-// CHECK:               scf.if %[[VAL_26]] {
-// CHECK:                 memref.store %[[VAL_23]], %[[VAL_20]]{{\[}}%[[VAL_6]]] : memref<2xindex>
-// CHECK:                 memref.store %[[VAL_24]], %[[VAL_20]]{{\[}}%[[VAL_8]]] : memref<2xindex>
-// CHECK:                 memref.store %[[VAL_25]], %[[VAL_22]][] : memref<f64>
-// CHECK:                 %[[VAL_27:.*]] = func.call @addEltF64(%[[VAL_19]], %[[VAL_22]], %[[VAL_21]], %[[VAL_17]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:           memref.store %[[VAL_4]], %[[VAL_14]]{{\[}}%[[VAL_4]]] : memref<2xindex>
+// CHECK:           memref.store %[[VAL_6]], %[[VAL_14]]{{\[}}%[[VAL_6]]] : memref<2xindex>
+// CHECK:           %[[VAL_16:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
+// CHECK:           %[[VAL_17:.*]] = call @newSparseTensor(%[[VAL_13]], %[[VAL_13]], %[[VAL_11]], %[[VAL_15]], %[[VAL_15]], %[[VAL_3]], %[[VAL_3]], %[[VAL_2]], %[[VAL_2]], %[[VAL_16]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:           %[[VAL_18:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:           %[[VAL_19:.*]] = memref.cast %[[VAL_18]] : memref<2xindex> to memref<?xindex>
+// CHECK:           %[[VAL_20:.*]] = memref.alloca() : memref<f64>
+// CHECK:           scf.for %[[VAL_21:.*]] = %[[VAL_4]] to %[[VAL_5]] step %[[VAL_6]] {
+// CHECK:             scf.for %[[VAL_22:.*]] = %[[VAL_4]] to %[[VAL_7]] step %[[VAL_6]] {
+// CHECK:               %[[VAL_23:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_21]], %[[VAL_22]]] : tensor<2x4xf64>
+// CHECK:               %[[VAL_24:.*]] = arith.cmpf une, %[[VAL_23]], %[[VAL_1]] : f64
+// CHECK:               scf.if %[[VAL_24]] {
+// CHECK:                 memref.store %[[VAL_21]], %[[VAL_18]]{{\[}}%[[VAL_4]]] : memref<2xindex>
+// CHECK:                 memref.store %[[VAL_22]], %[[VAL_18]]{{\[}}%[[VAL_6]]] : memref<2xindex>
+// CHECK:                 memref.store %[[VAL_23]], %[[VAL_20]][] : memref<f64>
+// CHECK:                 %[[VAL_25:.*]] = func.call @forwardingInsertF64(%[[VAL_17]], %[[VAL_20]], %[[VAL_19]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>) -> !llvm.ptr<i8>
 // CHECK:               }
 // CHECK:             }
 // CHECK:           }
-// CHECK:           %[[VAL_28:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_15]], %[[VAL_13]], %[[VAL_17]], %[[VAL_17]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_1]], %[[VAL_19]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOF64(%[[VAL_19]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_28]] : !llvm.ptr<i8>
+// CHECK:           call @endForwardingInsert(%[[VAL_17]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_17]] : !llvm.ptr<i8>
 // CHECK:         }
 func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
   %0 = sparse_tensor.convert %arg0 : tensor<2x4xf64> to tensor<2x4xf64, #CSR>
@@ -155,7 +148,7 @@ func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
 // CHECK-LABEL:   func.func @sparse_constant() -> !llvm.ptr<i8> {
 // CHECK-DAG:       %[[VAL_0:.*]] = arith.constant dense<[1.000000e+00, 5.000000e+00]> : tensor<2xf32>
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant dense<{{\[\[}}0, 0], [1, 6]]> : tensor<2x2xi64>
-// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 4 : i32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : i32
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 2 : i32
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : i32
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
@@ -191,11 +184,10 @@ func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
 // CHECK:             memref.store %[[VAL_25]], %[[VAL_20]]{{\[}}%[[VAL_5]]] : memref<2xindex>
 // CHECK:             memref.store %[[VAL_27]], %[[VAL_20]]{{\[}}%[[VAL_7]]] : memref<2xindex>
 // CHECK:             memref.store %[[VAL_28]], %[[VAL_22]][] : memref<f32>
-// CHECK:             %[[VAL_29:.*]] = func.call @addEltF32(%[[VAL_19]], %[[VAL_22]], %[[VAL_21]], %[[VAL_17]]) : (!llvm.ptr<i8>, memref<f32>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:             %[[VAL_29:.*]] = func.call @forwardingInsertF32(%[[VAL_19]], %[[VAL_22]], %[[VAL_21]]) : (!llvm.ptr<i8>, memref<f32>, memref<?xindex>) -> !llvm.ptr<i8>
 // CHECK:           }
-// CHECK:           %[[VAL_30:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_15]], %[[VAL_13]], %[[VAL_17]], %[[VAL_17]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_3]], %[[VAL_19]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOF32(%[[VAL_19]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_30]] : !llvm.ptr<i8>
+// CHECK:           call @endForwardingInsert(%[[VAL_19]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_19]] : !llvm.ptr<i8>
 // CHECK:         }
 func.func @sparse_constant() -> tensor<8x7xf32, #CSR>{
   // Initialize a tensor.
@@ -208,7 +200,7 @@ func.func @sparse_constant() -> tensor<8x7xf32, #CSR>{
 // CHECK-LABEL:   func.func @sparse_constant_csc() -> !llvm.ptr<i8> {
 // CHECK-DAG:       %[[VAL_0:.*]] = arith.constant dense<[1.000000e+00, 5.000000e+00]> : tensor<2xf32>
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant dense<{{\[\[}}0, 0], [1, 6]]> : tensor<2x2xi64>
-// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 4 : i32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : i32
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 2 : i32
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : i32
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
@@ -252,11 +244,10 @@ func.func @sparse_constant() -> tensor<8x7xf32, #CSR>{
 // CHECK:             memref.store %[[VAL_29]], %[[VAL_24]]{{\[}}%[[VAL_5]]] : memref<2xindex>
 // CHECK:             memref.store %[[VAL_31]], %[[VAL_24]]{{\[}}%[[VAL_7]]] : memref<2xindex>
 // CHECK:             memref.store %[[VAL_32]], %[[VAL_26]][] : memref<f32>
-// CHECK:             %[[VAL_33:.*]] = func.call @addEltF32(%[[VAL_23]], %[[VAL_26]], %[[VAL_25]], %[[VAL_17]]) : (!llvm.ptr<i8>, memref<f32>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:             %[[VAL_33:.*]] = func.call @forwardingInsertF32(%[[VAL_23]], %[[VAL_26]], %[[VAL_25]]) : (!llvm.ptr<i8>, memref<f32>, memref<?xindex>) -> !llvm.ptr<i8>
 // CHECK:           }
-// CHECK:           %[[VAL_34:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_21]], %[[VAL_13]], %[[VAL_17]], %[[VAL_19]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_3]], %[[VAL_23]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOF32(%[[VAL_23]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_34]] : !llvm.ptr<i8>
+// CHECK:           call @endForwardingInsert(%[[VAL_23]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_23]] : !llvm.ptr<i8>
 // CHECK:         }
 func.func @sparse_constant_csc() -> tensor<8x7xf32, #CSC>{
   // Initialize a tensor.
@@ -268,70 +259,67 @@ func.func @sparse_constant_csc() -> tensor<8x7xf32, #CSC>{
 
 // CHECK-LABEL:   func.func @sparse_convert_3d(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<?x?x?xf64>) -> !llvm.ptr<i8> {
-// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 2 : i32
-// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 4 : i32
-// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : i32
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : i32
-// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 8 : i8
-// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 4 : i8
-// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 2 : index
-// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_11:.*]] = tensor.dim %[[VAL_0]], %[[VAL_10]] : tensor<?x?x?xf64>
-// CHECK:           %[[VAL_12:.*]] = tensor.dim %[[VAL_0]], %[[VAL_9]] : tensor<?x?x?xf64>
-// CHECK:           %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_8]] : tensor<?x?x?xf64>
-// CHECK:           %[[VAL_14:.*]] = memref.alloca() : memref<3xi8>
-// CHECK:           %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<3xi8> to memref<?xi8>
-// CHECK:           memref.store %[[VAL_7]], %[[VAL_14]]{{\[}}%[[VAL_10]]] : memref<3xi8>
-// CHECK:           memref.store %[[VAL_6]], %[[VAL_14]]{{\[}}%[[VAL_9]]] : memref<3xi8>
-// CHECK:           memref.store %[[VAL_6]], %[[VAL_14]]{{\[}}%[[VAL_8]]] : memref<3xi8>
-// CHECK:           %[[VAL_16:.*]] = memref.alloca() : memref<3xindex>
-// CHECK:           %[[VAL_17:.*]] = memref.cast %[[VAL_16]] : memref<3xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_11]], %[[VAL_16]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_12]], %[[VAL_16]]{{\[}}%[[VAL_9]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_13]], %[[VAL_16]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK:           %[[VAL_19:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:           %[[VAL_20:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_9]]] : memref<3xindex>
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 8 : i8
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 4 : i8
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_8]] : tensor<?x?x?xf64>
+// CHECK:           %[[VAL_10:.*]] = tensor.dim %[[VAL_0]], %[[VAL_7]] : tensor<?x?x?xf64>
+// CHECK:           %[[VAL_11:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?x?x?xf64>
+// CHECK:           %[[VAL_12:.*]] = memref.alloca() : memref<3xi8>
+// CHECK:           %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<3xi8> to memref<?xi8>
+// CHECK:           memref.store %[[VAL_5]], %[[VAL_12]]{{\[}}%[[VAL_8]]] : memref<3xi8>
+// CHECK:           memref.store %[[VAL_4]], %[[VAL_12]]{{\[}}%[[VAL_7]]] : memref<3xi8>
+// CHECK:           memref.store %[[VAL_4]], %[[VAL_12]]{{\[}}%[[VAL_6]]] : memref<3xi8>
+// CHECK:           %[[VAL_14:.*]] = memref.alloca() : memref<3xindex>
+// CHECK:           %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<3xindex> to memref<?xindex>
+// CHECK:           memref.store %[[VAL_9]], %[[VAL_14]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_10]], %[[VAL_14]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_11]], %[[VAL_14]]{{\[}}%[[VAL_6]]] : memref<3xindex>
+// CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_6]]] : memref<3xindex>
+// CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:           %[[VAL_19:.*]] = memref.alloca() : memref<3xindex>
+// CHECK:           %[[VAL_20:.*]] = memref.cast %[[VAL_19]] : memref<3xindex> to memref<?xindex>
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_19]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_6]], %[[VAL_19]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_8]], %[[VAL_19]]{{\[}}%[[VAL_6]]] : memref<3xindex>
 // CHECK:           %[[VAL_21:.*]] = memref.alloca() : memref<3xindex>
 // CHECK:           %[[VAL_22:.*]] = memref.cast %[[VAL_21]] : memref<3xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_9]], %[[VAL_21]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_8]], %[[VAL_21]]{{\[}}%[[VAL_9]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_10]], %[[VAL_21]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_6]], %[[VAL_21]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_8]], %[[VAL_21]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_21]]{{\[}}%[[VAL_6]]] : memref<3xindex>
 // CHECK:           %[[VAL_23:.*]] = memref.alloca() : memref<3xindex>
 // CHECK:           %[[VAL_24:.*]] = memref.cast %[[VAL_23]] : memref<3xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_8]], %[[VAL_23]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_10]], %[[VAL_23]]{{\[}}%[[VAL_9]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_9]], %[[VAL_23]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK:           %[[VAL_25:.*]] = memref.alloca() : memref<3xindex>
-// CHECK:           %[[VAL_26:.*]] = memref.cast %[[VAL_25]] : memref<3xindex> to memref<?xindex>
-// CHECK:           memref.store %[[VAL_18]], %[[VAL_25]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_19]], %[[VAL_25]]{{\[}}%[[VAL_9]]] : memref<3xindex>
-// CHECK:           memref.store %[[VAL_20]], %[[VAL_25]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK:           %[[VAL_27:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:           %[[VAL_28:.*]] = call @newSparseTensor(%[[VAL_17]], %[[VAL_26]], %[[VAL_15]], %[[VAL_22]], %[[VAL_24]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_3]], %[[VAL_27]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           %[[VAL_29:.*]] = memref.alloca() : memref<3xindex>
-// CHECK:           %[[VAL_30:.*]] = memref.cast %[[VAL_29]] : memref<3xindex> to memref<?xindex>
-// CHECK:           %[[VAL_31:.*]] = memref.alloca() : memref<f64>
-// CHECK:           scf.for %[[VAL_32:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_9]] {
-// CHECK:             scf.for %[[VAL_33:.*]] = %[[VAL_10]] to %[[VAL_12]] step %[[VAL_9]] {
-// CHECK:               scf.for %[[VAL_34:.*]] = %[[VAL_10]] to %[[VAL_13]] step %[[VAL_9]] {
-// CHECK:                 %[[VAL_35:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_32]], %[[VAL_33]], %[[VAL_34]]] : tensor<?x?x?xf64>
-// CHECK:                 %[[VAL_36:.*]] = arith.cmpf une, %[[VAL_35]], %[[VAL_2]] : f64
-// CHECK:                 scf.if %[[VAL_36]] {
-// CHECK:                   memref.store %[[VAL_32]], %[[VAL_29]]{{\[}}%[[VAL_10]]] : memref<3xindex>
-// CHECK:                   memref.store %[[VAL_33]], %[[VAL_29]]{{\[}}%[[VAL_9]]] : memref<3xindex>
-// CHECK:                   memref.store %[[VAL_34]], %[[VAL_29]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK:                   memref.store %[[VAL_35]], %[[VAL_31]][] : memref<f64>
-// CHECK:                   %[[VAL_37:.*]] = func.call @addEltF64(%[[VAL_28]], %[[VAL_31]], %[[VAL_30]], %[[VAL_22]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:           memref.store %[[VAL_16]], %[[VAL_23]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_17]], %[[VAL_23]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:           memref.store %[[VAL_18]], %[[VAL_23]]{{\[}}%[[VAL_6]]] : memref<3xindex>
+// CHECK:           %[[VAL_25:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
+// CHECK:           %[[VAL_26:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_24]], %[[VAL_13]], %[[VAL_20]], %[[VAL_22]], %[[VAL_3]], %[[VAL_3]], %[[VAL_2]], %[[VAL_2]], %[[VAL_25]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:           %[[VAL_27:.*]] = memref.alloca() : memref<3xindex>
+// CHECK:           %[[VAL_28:.*]] = memref.cast %[[VAL_27]] : memref<3xindex> to memref<?xindex>
+// CHECK:           %[[VAL_29:.*]] = memref.alloca() : memref<f64>
+// CHECK:           scf.for %[[VAL_30:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_7]] {
+// CHECK:             scf.for %[[VAL_31:.*]] = %[[VAL_8]] to %[[VAL_10]] step %[[VAL_7]] {
+// CHECK:               scf.for %[[VAL_32:.*]] = %[[VAL_8]] to %[[VAL_11]] step %[[VAL_7]] {
+// CHECK:                 %[[VAL_33:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_30]], %[[VAL_31]], %[[VAL_32]]] : tensor<?x?x?xf64>
+// CHECK:                 %[[VAL_34:.*]] = arith.cmpf une, %[[VAL_33]], %[[VAL_1]] : f64
+// CHECK:                 scf.if %[[VAL_34]] {
+// CHECK:                   memref.store %[[VAL_30]], %[[VAL_27]]{{\[}}%[[VAL_8]]] : memref<3xindex>
+// CHECK:                   memref.store %[[VAL_31]], %[[VAL_27]]{{\[}}%[[VAL_7]]] : memref<3xindex>
+// CHECK:                   memref.store %[[VAL_32]], %[[VAL_27]]{{\[}}%[[VAL_6]]] : memref<3xindex>
+// CHECK:                   memref.store %[[VAL_33]], %[[VAL_29]][] : memref<f64>
+// CHECK:                   %[[VAL_35:.*]] = func.call @forwardingInsertF64(%[[VAL_26]], %[[VAL_29]], %[[VAL_28]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>) -> !llvm.ptr<i8>
 // CHECK:                 }
 // CHECK:               }
 // CHECK:             }
 // CHECK:           }
-// CHECK:           %[[VAL_38:.*]] = call @newSparseTensor(%[[VAL_17]], %[[VAL_26]], %[[VAL_15]], %[[VAL_22]], %[[VAL_24]], %[[VAL_5]], %[[VAL_5]], %[[VAL_4]], %[[VAL_1]], %[[VAL_28]]) : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
-// CHECK:           call @delSparseTensorCOOF64(%[[VAL_28]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           return %[[VAL_38]] : !llvm.ptr<i8>
+// CHECK:           call @endForwardingInsert(%[[VAL_26]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           return %[[VAL_26]] : !llvm.ptr<i8>
 // CHECK:         }
 func.func @sparse_convert_3d(%arg0: tensor<?x?x?xf64>) -> tensor<?x?x?xf64, #SparseTensor> {
   %0 = sparse_tensor.convert %arg0 : tensor<?x?x?xf64> to tensor<?x?x?xf64, #SparseTensor>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
index d19d7fe2871d674..9d8db10aa423022 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
@@ -62,7 +62,7 @@
 // CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
 // CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
 // CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
-// CHECK-CONVERT: call @endInsert
+// CHECK-CONVERT: call @endLexInsert
 //
 func.func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
   %c0 = arith.constant 0 : index
@@ -115,7 +115,7 @@ func.func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
 // CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
 // CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
 // CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
-// CHECK-CONVERT: call @endInsert
+// CHECK-CONVERT: call @endLexInsert
 //
 func.func @matmul1(%A: tensor<8x2xf64, #CSR>,
                    %B: tensor<2x4xf64, #CSR>) -> tensor<8x4xf64, #CSR> {
@@ -163,7 +163,7 @@ func.func @matmul1(%A: tensor<8x2xf64, #CSR>,
 // CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
 // CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
 // CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
-// CHECK-CONVERT: call @endInsert
+// CHECK-CONVERT: call @endLexInsert
 //
 func.func @matmul2(%A: tensor<8x2xf64, #CSC>,
                    %B: tensor<2x4xf64, #CSC>) -> tensor<8x4xf64, #CSC> {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
index 7d852ca9cc1aa26..8ecbc1da965a156 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
@@ -112,7 +112,7 @@
 // CHECK:           memref.dealloc %[[VAL_20]] : memref<300xf64>
 // CHECK:           memref.dealloc %[[VAL_22]] : memref<300xi1>
 // CHECK:           memref.dealloc %[[VAL_24]] : memref<300xindex>
-// CHECK:           call @endInsert(%[[VAL_19]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           call @endLexInsert(%[[VAL_19]]) : (!llvm.ptr<i8>) -> ()
 // CHECK:           return %[[VAL_19]] : !llvm.ptr<i8>
 // CHECK:       }
 func.func @fill_zero_after_alloc(%arg0: tensor<100x200xf64, #DCSR>,



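For readers skimming the CHECK lines above, here is a minimal hand-written MLIR sketch (not taken from the patch) of the call sequence the refactored dense-to-sparse lowering now emits, written for a hypothetical 1-D f64 input. The runtime symbols @newSparseTensor, @forwardingInsertF64, and @endForwardingInsert and their call signatures are copied from the CHECK lines; the function name @dense2sparse_sketch, the tensor<4xf64> shape, and the concrete i8/i32 constants are illustrative assumptions only.

// Hypothetical declarations mirroring the calls verified by the tests above.
func.func private @newSparseTensor(memref<?xindex>, memref<?xindex>, memref<?xi8>,
                                   memref<?xindex>, memref<?xindex>,
                                   i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
func.func private @forwardingInsertF64(!llvm.ptr<i8>, memref<f64>, memref<?xindex>) -> !llvm.ptr<i8>
func.func private @endForwardingInsert(!llvm.ptr<i8>)

func.func @dense2sparse_sketch(%dense: tensor<4xf64>) -> !llvm.ptr<i8> {
  %zero = arith.constant 0.000000e+00 : f64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %i0 = arith.constant 0 : i32
  %i1 = arith.constant 1 : i32    // plays the same roles as the 1 : i32 in the CHECK lines above
  %compressed = arith.constant 8 : i8
  // Describe the target: one compressed level of size 4, identity dim<->lvl maps.
  %types = memref.alloca() : memref<1xi8>
  memref.store %compressed, %types[%c0] : memref<1xi8>
  %typesD = memref.cast %types : memref<1xi8> to memref<?xi8>
  %sizes = memref.alloca() : memref<1xindex>
  memref.store %c4, %sizes[%c0] : memref<1xindex>
  %sizesD = memref.cast %sizes : memref<1xindex> to memref<?xindex>
  %map = memref.alloca() : memref<1xindex>
  memref.store %c0, %map[%c0] : memref<1xindex>
  %mapD = memref.cast %map : memref<1xindex> to memref<?xindex>
  %null = llvm.mlir.zero : !llvm.ptr<i8>
  // Create an empty sparse tensor that accepts forwarding inserts.
  %t = call @newSparseTensor(%sizesD, %sizesD, %typesD, %mapD, %mapD,
                             %i0, %i0, %i1, %i1, %null)
      : (memref<?xindex>, memref<?xindex>, memref<?xi8>, memref<?xindex>,
         memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
  // Forward every nonzero of the dense input directly into the tensor.
  %coords = memref.alloca() : memref<1xindex>
  %coordsD = memref.cast %coords : memref<1xindex> to memref<?xindex>
  %value = memref.alloca() : memref<f64>
  scf.for %i = %c0 to %c4 step %c1 {
    %v = tensor.extract %dense[%i] : tensor<4xf64>
    %nz = arith.cmpf une, %v, %zero : f64
    scf.if %nz {
      memref.store %i, %coords[%c0] : memref<1xindex>
      memref.store %v, %value[] : memref<f64>
      %r = func.call @forwardingInsertF64(%t, %value, %coordsD)
          : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>) -> !llvm.ptr<i8>
    }
  }
  // Finalize the storage once all nonzeros have been forwarded.
  call @endForwardingInsert(%t) : (!llvm.ptr<i8>) -> ()
  return %t : !llvm.ptr<i8>
}

Compared with the pattern being deleted in the hunks above, the loop forwards each nonzero straight into the tensor instead of accumulating an intermediate COO, so the trailing @newSparseTensor-from-COO and @delSparseTensorCOO* calls disappear and a single @endForwardingInsert finalizes the result.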