[Mlir-commits] [mlir] c248219 - [mlir][sparse] Implements concatenate operation for sparse tensor

Peiming Liu llvmlistbot at llvm.org
Tue Aug 16 13:47:52 PDT 2022


Author: Peiming Liu
Date: 2022-08-16T20:47:47Z
New Revision: c248219b09c1e724468d4603f647466b3e282330

URL: https://github.com/llvm/llvm-project/commit/c248219b09c1e724468d4603f647466b3e282330
DIFF: https://github.com/llvm/llvm-project/commit/c248219b09c1e724468d4603f647466b3e282330.diff

LOG: [mlir][sparse] Implements concatenate operation for sparse tensor

This patch implements the conversion rule for the operation introduced in https://reviews.llvm.org/D131200.
It also adds an integration test for correctness.
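
For reference, a minimal example of the operation being lowered, taken from the
new conversion test sparse_concat.mlir below (the #SparseMatrix encoding is
defined in that test):

    %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 0 : index}
         : tensor<2x4xf64>, tensor<3x4xf64, #SparseMatrix> to tensor<5x4xf64, #SparseMatrix>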

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D131200

Added: 
    mlir/test/Dialect/SparseTensor/sparse_concat.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate.mlir

Modified: 
    mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index dca1c52413724..d949cf6006a0a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -117,6 +117,26 @@ static Value genNewCall(OpBuilder &builder, Operation *op,
       .getResult(0);
 }
 
+/// Compute the size from type (for static sizes) or from an already-converted
+/// opaque pointer source (for dynamic sizes) at the given dimension.
+static Value sizeFromPtrAtDim(OpBuilder &builder, Operation *op,
+                              SparseTensorEncodingAttr &enc, ShapedType stp,
+                              Value src, unsigned dim) {
+  auto shape = stp.getShape();
+  if (shape[dim] == ShapedType::kDynamicSize)
+    return genDimSizeCall(builder, op, enc, src, dim);
+  return constantIndex(builder, op->getLoc(), shape[dim]);
+}
+
+/// Populates given sizes array from type (for static sizes) and from
+/// an already-converted opaque pointer source (for dynamic sizes).
+static void sizesFromPtr(OpBuilder &builder, SmallVector<Value, 4> &sizes,
+                         Operation *op, SparseTensorEncodingAttr &enc,
+                         ShapedType stp, Value src) {
+  for (unsigned i = 0, rank = stp.getRank(); i < rank; i++)
+    sizes.push_back(sizeFromPtrAtDim(builder, op, enc, stp, src, i));
+}
+
 /// Populates given sizes array from type.
 static void sizesFromType(OpBuilder &builder, SmallVector<Value, 4> &sizes,
                           Location loc, ShapedType stp) {
@@ -135,18 +155,42 @@ static void sizesFromSrc(OpBuilder &builder, SmallVector<Value, 4> &sizes,
     sizes.push_back(linalg::createOrFoldDimOp(builder, loc, src, i));
 }
 
-/// Populates given sizes array from type (for static sizes) and from
-/// an already converted into opague pointer source (for dynamic sizes).
-static void sizesFromPtr(OpBuilder &builder, SmallVector<Value, 4> &sizes,
-                         Operation *op, SparseTensorEncodingAttr &enc,
-                         ShapedType stp, Value src) {
+/// Populates the given sizes array for concatenation from type (for static
+/// sizes) and from an already-converted opaque pointer source (for dynamic
+/// sizes).
+static void concatSizesFromInputs(OpBuilder &builder,
+                                  SmallVector<Value, 4> &sizes, Operation *op,
+                                  ShapedType dstTp, ValueRange srcs,
+                                  unsigned dim) {
   Location loc = op->getLoc();
-  auto shape = stp.getShape();
-  for (unsigned i = 0, rank = stp.getRank(); i < rank; i++)
-    if (shape[i] == ShapedType::kDynamicSize)
-      sizes.push_back(genDimSizeCall(builder, op, enc, src, i));
-    else
-      sizes.push_back(constantIndex(builder, loc, shape[i]));
+  auto dstShape = dstTp.getShape();
+
+  auto srcTp = srcs[0].getType().cast<ShapedType>();
+  auto srcEnc = getSparseTensorEncoding(srcTp);
+  // We first fill the sizes from an input tensor, and then
+  // compute the size of the concatenation dimension if necessary.
+  if (srcEnc)
+    // Reusing the sizes from an arbitrary input tensor is fine.
+    sizesFromPtr(builder, sizes, op, srcEnc, srcTp, srcs[0]);
+  else
+    sizesFromSrc(builder, sizes, loc, srcs[0]);
+
+  // Sum up along `dim` if the dimension is dynamic.
+  if (dstShape[dim] != ShapedType::kDynamicSize) {
+    // Faithfully take the static size.
+    sizes[dim] = constantIndex(builder, loc, dstShape[dim]);
+  } else {
+    // Else, compute the shape dynamically.
+    for (size_t i = 1, sz = srcs.size(); i < sz; i++) {
+      auto srcTp = srcs[i].getType().cast<ShapedType>();
+      auto encSrc = getSparseTensorEncoding(srcTp);
+      Value srcSz =
+          encSrc ? sizeFromPtrAtDim(builder, op, encSrc, srcTp, srcs[i], dim)
+                 : linalg::createOrFoldDimOp(builder, loc, srcs[i], dim);
+      // Sum up all the sizes.
+      sizes[dim] = builder.create<arith::AddIOp>(loc, sizes[dim], srcSz);
+    }
+  }
 }
 
 /// Generates an uninitialized temporary buffer of the given size and
@@ -234,6 +278,20 @@ static void newParams(OpBuilder &builder, SmallVector<Value, 8> &params,
   params.push_back(ptr);
 }
 
+/// Generates the code to read the value from tensor[ivs]. The generated code
+/// looks like the following and the insertion point after this routine is
+/// inside the if-then branch.
+///    if (tensor[ivs] != 0)
+///      insert_point
+static Value genValueForDense(OpBuilder &builder, Location loc, Value tensor,
+                              ValueRange ivs) {
+  Value val = builder.create<tensor::ExtractOp>(loc, tensor, ivs);
+  Value cond = genIsNonzero(builder, loc, val);
+  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, cond, /*else*/ false);
+  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+  return val;
+}
+
 /// Generates the code to read the value from tensor[ivs], and conditionally
 /// stores the indices ivs to the memory in ind. The generated code looks like
 /// the following and the insertion point after this routine is inside the
@@ -243,10 +301,7 @@ static void newParams(OpBuilder &builder, SmallVector<Value, 8> &params,
 ///      ind = ivs
 static Value genIndexAndValueForDense(OpBuilder &builder, Location loc,
                                       Value tensor, Value ind, ValueRange ivs) {
-  Value val = builder.create<tensor::ExtractOp>(loc, tensor, ivs);
-  Value cond = genIsNonzero(builder, loc, val);
-  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, cond, /*else*/ false);
-  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+  Value val = genValueForDense(builder, loc, tensor, ivs);
   unsigned i = 0;
   for (auto iv : ivs) {
     Value idx = constantIndex(builder, loc, i++);
@@ -346,18 +401,43 @@ static void deallocDenseTensor(OpBuilder &builder, Location loc, Value buffer) {
   builder.create<memref::DeallocOp>(loc, buffer);
 }
 
-/// Inserts the element returned by genGetNextCall(_, ind, elemPtr) into
-/// the tensor created by allocDenseTensor().  The `rank` is the rank
-/// of the `tensor` and the length of `ind`.
-static void insertScalarIntoDenseTensor(OpBuilder &builder, Location loc,
-                                        Value elemPtr, Value tensor,
-                                        unsigned rank, Value ind) {
+/// Loads the indices from the memref `ind` (as filled by calls to
+/// iter->next()) into a vector of index values, applying the (optional)
+/// `offset` at `offsetDim`.
+static SmallVector<Value, 4> loadIndices(OpBuilder &builder, Location loc,
+                                         unsigned rank, Value ind,
+                                         unsigned offsetDim = 0,
+                                         Value offset = Value()) {
   SmallVector<Value, 4> ivs;
   ivs.reserve(rank);
   for (unsigned i = 0; i < rank; i++) {
     Value idx = constantIndex(builder, loc, i);
-    ivs.push_back(builder.create<memref::LoadOp>(loc, ind, idx));
+    idx = builder.create<memref::LoadOp>(loc, ind, idx);
+    if (offsetDim == i && offset)
+      idx = builder.create<arith::AddIOp>(loc, idx, offset);
+    ivs.push_back(idx);
+  }
+  return ivs;
+}
+
+/// Stores the given vector of indices into the memory pointed to by `ind`,
+/// applying the (optional) `offset` at `offsetDim`.
+static void storeIndices(OpBuilder &builder, Location loc, unsigned rank,
+                         Value ind, ValueRange ivs, unsigned offsetDim = 0,
+                         Value offset = Value()) {
+  for (unsigned i = 0; i < rank; i++) {
+    Value idx = ivs[i];
+    if (offsetDim == i && offset)
+      idx = builder.create<arith::AddIOp>(loc, idx, offset);
+    builder.create<memref::StoreOp>(loc, idx, ind,
+                                    constantIndex(builder, loc, i));
   }
+}
+
+/// Inserts a value stored in `elemPtr` into a dense tensor created by
+/// allocDenseTensor().
+static void insertScalarIntoDenseTensor(OpBuilder &builder, Location loc,
+                                        Value elemPtr, Value tensor,
+                                        ValueRange ivs) {
   Value elemV = builder.create<memref::LoadOp>(loc, elemPtr);
   builder.create<memref::StoreOp>(loc, elemV, tensor, ivs);
 }
@@ -510,6 +590,100 @@ genSparse2SparseReshape(Operation *op, ConversionPatternRewriter &rewriter,
   return success();
 }
 
+// Generates a while loop that iterates over the COO list extracted
+// from `t`, using `bodyBuilder` to build the loop body.
+//   while (elem = coo->getNext()) {
+//     bodyBuilder
+//   }
+// TODO: Get rid of Operation *op in the parameter list! It seems we only
+// use it for op->getLoc(); pass the Location directly instead!
+// TODO: This could also be used by the conversion of other operators
+// (ReshapeOp, ConvertOp) to reduce code repetition!
+static void genSparseCOOIterationLoop(
+    ConversionPatternRewriter &rewriter, Operation *op, Value t,
+    RankedTensorType tensorTp,
+    function_ref<void(OpBuilder &, Location, Value, Value)> bodyBuilder) {
+  Location loc = op->getLoc();
+  auto enc = getSparseTensorEncoding(tensorTp);
+  assert(enc && "Generating Sparse Tensor COO Loop on a Dense Tensor!");
+
+  unsigned rank = tensorTp.getRank();
+  Type elemTp = tensorTp.getElementType();
+
+  // Start an iterator over the tensor (in original index order).
+  auto noPerm = SparseTensorEncodingAttr::get(
+      rewriter.getContext(), enc.getDimLevelType(), AffineMap(),
+      enc.getPointerBitWidth(), enc.getIndexBitWidth());
+  SmallVector<Value, 4> sizes;
+  SmallVector<Value, 8> params;
+  sizesFromPtr(rewriter, sizes, op, noPerm, tensorTp, t);
+  newParams(rewriter, params, op, tensorTp, noPerm, Action::kToIterator, sizes,
+            t);
+  Value iter = genNewCall(rewriter, op, params);
+
+  // Construct a while loop over the iterator.
+  Value srcIdx = genAlloca(rewriter, loc, rank, rewriter.getIndexType());
+  Value elemPtr = genAllocaScalar(rewriter, loc, elemTp);
+  SmallVector<Value> noArgs;
+  SmallVector<Type> noTypes;
+  auto whileOp = rewriter.create<scf::WhileOp>(loc, noTypes, noArgs);
+  Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, noTypes);
+  rewriter.setInsertionPointToEnd(before);
+  Value cond = genGetNextCall(rewriter, op, iter, srcIdx, elemPtr);
+  rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
+  Block *after = rewriter.createBlock(&whileOp.getAfter(), {}, noTypes);
+  rewriter.setInsertionPointToStart(after);
+  // Callback here to build loop body.
+  bodyBuilder(rewriter, loc, srcIdx, elemPtr);
+  rewriter.create<scf::YieldOp>(loc);
+  // Finish generating loop.
+  rewriter.setInsertionPointAfter(whileOp);
+
+  // Free memory for iterator.
+  genDelCOOCall(rewriter, op, elemTp, iter);
+}
+
+// Generate loop that iterates over a dense tensor.
+//   for i1 in dim1
+//    ..
+//     for ik in dimk
+//       val = a[i1,..,ik]
+//       if val != 0
+//         bodyBuilder(v, [i1, ..., ik])
+// TODO: This could also be used by the conversion of other operators
+// (ReshapeOp, ConvertOp) to reduce code repetition!
+static void genDenseTensorIterationLoop(
+    ConversionPatternRewriter &rewriter, Operation *op, Value t,
+    RankedTensorType tensorTp,
+    function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
+  Location loc = op->getLoc();
+  auto enc = getSparseTensorEncoding(tensorTp);
+  assert(!enc && "Generating Dense Tensor Loop on a Sparse Tensor!");
+
+  unsigned rank = tensorTp.getRank();
+  Value zero = constantIndex(rewriter, loc, 0);
+  Value one = constantIndex(rewriter, loc, 1);
+
+  SmallVector<Value> lo;
+  SmallVector<Value> hi;
+  SmallVector<Value> st;
+
+  // Fill out loop iteration information.
+  for (unsigned i = 0; i < rank; i++) {
+    lo.push_back(zero);
+    hi.push_back(linalg::createOrFoldDimOp(rewriter, loc, t, i));
+    st.push_back(one);
+  }
+
+  scf::buildLoopNest(rewriter, op->getLoc(), lo, hi, st, {},
+                     [&](OpBuilder &builder, Location loc, ValueRange ivs,
+                         ValueRange args) -> scf::ValueVector {
+                       // Invoke callback to build the body of the loop.
+                       bodyBuilder(builder, loc, ivs);
+                       return {};
+                     });
+}
+
 //===----------------------------------------------------------------------===//
 // Conversion rules.
 //===----------------------------------------------------------------------===//
@@ -760,7 +934,8 @@ class SparseTensorConvertConverter : public OpConversionPattern<ConvertOp> {
       rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
       Block *after = rewriter.createBlock(&whileOp.getAfter(), {}, noTypes);
       rewriter.setInsertionPointToStart(after);
-      insertScalarIntoDenseTensor(rewriter, loc, elemPtr, dst, rank, ind);
+      SmallVector<Value, 4> ivs = loadIndices(rewriter, loc, rank, ind);
+      insertScalarIntoDenseTensor(rewriter, loc, elemPtr, dst, ivs);
       rewriter.create<scf::YieldOp>(loc);
       rewriter.setInsertionPointAfter(whileOp);
       genDelCOOCall(rewriter, op, elemTp, iter);
@@ -1043,6 +1218,139 @@ class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
   }
 };
 
+/// Sparse conversion rule for the concatenate operator.
+class SparseTensorConcatConverter : public OpConversionPattern<ConcatenateOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ConcatenateOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // The conversion works as follows:
+    // (1) When the output is sparse and the inputs are a mix of dense/sparse:
+    //    a_sparse = concat (b_dense, c_sparse, ....)
+    // =>
+    //    coo_for_a = newSparseCOO(shapeOf(a))
+    //    for i, j, k // dense input
+    //      coo->add(adjustForOffset(i,j,k), b[i,j,k])
+    //
+    //    for elem in sparse_input
+    //      coo->add(adjustForOffset(elem.indices), elem.value)
+    //    ...
+    //    a = newSparseTensor(coo_for_a)
+    //    return a
+    //
+    // (2) When the output is dense and the inputs are a mix of dense/sparse:
+    //    a_dense = concat (b_dense, c_sparse, ....)
+    // =>
+    //    a = malloc(shapeOf(a))
+    //    for i, j, k // dense input
+    //      a[ adjustForOffset(i,j,k) ] = b[i,j,k]
+    //
+    //    for elem in sparse_input
+    //      a[ adjustForOffset(elem.indices) ] = elem.value
+    //    return a
+    Location loc = op.getLoc();
+    auto dstTp = op.getType().cast<RankedTensorType>();
+    auto encDst = getSparseTensorEncoding(dstTp);
+    Type elemTp = dstTp.getElementType();
+    uint64_t concatDim = op.getDimension().getZExtValue();
+    unsigned rank = dstTp.getRank();
+
+    Value dst;     // destination tensor
+    Value dstPerm; // destination tensor permutation (if sparse out)
+    // A pointer to the value being inserted (if dense => sparse)
+    Value elemPtr;
+    // Memory that holds the indices added to the destination COO (if sparse out)
+    Value dstIdx;
+    // The offset applied to the dimension to be concatenated (starting from 0)
+    Value offset = constantIndex(rewriter, loc, 0);
+
+    SmallVector<Value, 4> sizes;
+    SmallVector<Value, 8> params;
+    concatSizesFromInputs(rewriter, sizes, op, dstTp, op.getInputs(),
+                          concatDim);
+
+    if (encDst) {
+      // Start a new COO for the destination tensor.
+      newParams(rewriter, params, op, dstTp, encDst, Action::kEmptyCOO, sizes);
+      dst = genNewCall(rewriter, op, params);
+      dstPerm = params[2];
+      elemPtr = genAllocaScalar(rewriter, loc, elemTp);
+      dstIdx = genAlloca(rewriter, loc, rank, rewriter.getIndexType());
+    } else {
+      // TODO: Dense buffers should be allocated/deallocated via the callback
+      // in BufferizationOptions.
+      dst = allocDenseTensor(rewriter, loc, dstTp, sizes);
+    }
+    for (auto it : llvm::zip(op.getInputs(), adaptor.getInputs())) {
+      Value originalOp = std::get<0>(it); // Input (with encoding) from Op
+      Value adaptedOp = std::get<1>(it); // Input (type converted) from adaptor
+      RankedTensorType srcTp = originalOp.getType().cast<RankedTensorType>();
+      auto encSrc = getSparseTensorEncoding(srcTp);
+      if (encSrc) {
+        genSparseCOOIterationLoop(
+            rewriter, op, adaptedOp, srcTp,
+            [&](OpBuilder &builder, Location loc, Value idx,
+                Value elemPtr) -> void {
+              auto indVec =
+                  loadIndices(builder, loc, rank, idx, concatDim, offset);
+              if (encDst) {
+                // Case: sparse => sparse
+                storeIndices(builder, loc, rank, dstIdx, indVec);
+                genAddEltCall(builder, op, elemTp, dst, elemPtr, dstIdx,
+                              dstPerm);
+              } else {
+                // Case: sparse => dense
+                insertScalarIntoDenseTensor(builder, loc, elemPtr, dst, indVec);
+              }
+            });
+      } else {
+        genDenseTensorIterationLoop(
+            rewriter, op, adaptedOp, srcTp,
+            [&](OpBuilder &builder, Location loc, ValueRange idx) -> void {
+              if (encDst) {
+                // Case: dense => sparse
+                storeIndices(builder, loc, rank, dstIdx, idx, concatDim,
+                             offset);
+                Value val = genValueForDense(builder, loc, adaptedOp, idx);
+                builder.create<memref::StoreOp>(loc, val, elemPtr);
+                genAddEltCall(builder, op, elemTp, dst, elemPtr, dstIdx,
+                              dstPerm);
+              } else {
+                // Case: dense => dense
+                Value val = genValueForDense(builder, loc, adaptedOp, idx);
+                SmallVector<Value, 4> indVec(idx);
+                // Apply offset.
+                indVec[concatDim] = builder.create<arith::AddIOp>(
+                    loc, indVec[concatDim], offset);
+                builder.create<memref::StoreOp>(loc, val, dst, indVec);
+              }
+            });
+      }
+      // Accumulate offset.
+      // TODO: avoid calling sparseDimSize multiple times by caching the result!
+      Value curDim = encSrc ? sizeFromPtrAtDim(rewriter, op, encSrc, srcTp,
+                                               adaptedOp, concatDim)
+                            : linalg::createOrFoldDimOp(rewriter, loc,
+                                                        adaptedOp, concatDim);
+
+      offset = rewriter.create<arith::AddIOp>(loc, offset, curDim);
+    }
+    if (encDst) {
+      params[6] = constantAction(rewriter, loc, Action::kFromCOO);
+      // In the sparse output case, the destination holds the COO.
+      Value coo = dst;
+      params[7] = coo;
+      dst = genNewCall(rewriter, op, params);
+      // Release resources.
+      genDelCOOCall(rewriter, op, elemTp, coo);
+      rewriter.replaceOp(op, dst);
+    } else {
+      rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, dstTp, dst);
+    }
+    return success();
+  }
+};
 /// Sparse conversion rule for the output operator.
 class SparseTensorOutConverter : public OpConversionPattern<OutOp> {
 public:
@@ -1099,12 +1407,13 @@ void mlir::populateSparseTensorConversionPatterns(
                SparseCastConverter, SparseTensorNewConverter,
                SparseReshapeConverter<tensor::ExpandShapeOp>,
                SparseReshapeConverter<tensor::CollapseShapeOp>,
-               SparseTensorAllocConverter, SparseTensorDeallocConverter,
-               SparseTensorToPointersConverter, SparseTensorToIndicesConverter,
-               SparseTensorToValuesConverter, SparseTensorLoadConverter,
-               SparseTensorLexInsertConverter, SparseTensorExpandConverter,
-               SparseTensorCompressConverter, SparseTensorOutConverter>(
-      typeConverter, patterns.getContext());
+               SparseTensorConcatConverter, SparseTensorAllocConverter,
+               SparseTensorDeallocConverter, SparseTensorToPointersConverter,
+               SparseTensorToIndicesConverter, SparseTensorToValuesConverter,
+               SparseTensorLoadConverter, SparseTensorLexInsertConverter,
+               SparseTensorExpandConverter, SparseTensorCompressConverter,
+               SparseTensorOutConverter>(typeConverter, patterns.getContext());
+
   patterns.add<SparseTensorConvertConverter>(typeConverter,
                                              patterns.getContext(), options);
 }

diff  --git a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
new file mode 100644
index 0000000000000..4bdb5dd8a711f
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
@@ -0,0 +1,360 @@
+// RUN: mlir-opt %s --sparse-tensor-conversion --canonicalize --cse | FileCheck %s
+
+#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}>
+
+#SparseMatrix_P = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+// CHECK-LABEL: func.func @concat_mix_dense(
+// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<2x4xf64>,
+// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
+// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
+// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
+// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:     %[[TMP_c1_i8:.*]] = arith.constant 1 : i8
+// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
+// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
+// CHECK:         %[[TMP_0:.*]] = memref.alloc() : memref<5x4xf64>
+// CHECK:         linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<5x4xf64>)
+// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
+// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
+// CHECK:             %[[TMP_12:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<2x4xf64>
+// CHECK:             %[[TMP_13:.*]] = arith.cmpf une, %[[TMP_12]], %[[TMP_cst]] : f64
+// CHECK:             scf.if %[[TMP_13]] {
+// CHECK:               memref.store %[[TMP_12]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<5x4xf64>
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+// CHECK:         %[[TMP_1:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_2:.*]] = memref.cast %[[TMP_1]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_1]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_1]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_3:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_4:.*]] = memref.cast %[[TMP_3]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c3]], %[[TMP_3]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_3]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_5:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_6:.*]] = memref.cast %[[TMP_5]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_5]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_5]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[TMP_2]], %[[TMP_4]], %[[TMP_6]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_9:.*]] = memref.cast %[[TMP_8]] : memref<2xindex> to memref<?xindex>
+// CHECK:         %[[TMP_10:.*]] = memref.alloca() : memref<f64>
+// CHECK:         scf.while : () -> () {
+// CHECK:           %[[TMP_12:.*]] = func.call @getNextF64(%[[TMP_7]], %[[TMP_9]], %[[TMP_10]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
+// CHECK:           scf.condition(%[[TMP_12]])
+// CHECK:         } do {
+// CHECK:           %[[TMP_12:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           %[[TMP_13:.*]] = arith.addi %[[TMP_12]], %[[TMP_c2]] : index
+// CHECK:           %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_15:.*]] = memref.load %[[TMP_10]][] : memref<f64>
+// CHECK:           memref.store %[[TMP_15]], %[[TMP_0]][%[[TMP_13]], %[[TMP_14]]] : memref<5x4xf64>
+// CHECK:           scf.yield
+// CHECK:         }
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         %[[TMP_11:.*]] = bufferization.to_tensor %[[TMP_0]] : memref<5x4xf64>
+// CHECK:         return %[[TMP_11]] : tensor<5x4xf64>
+// CHECK:       }
+func.func @concat_mix_dense(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #SparseMatrix>) -> tensor<5x4xf64> {
+  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 0 : index}
+       : tensor<2x4xf64>, tensor<3x4xf64, #SparseMatrix> to tensor<5x4xf64>
+  return %0 : tensor<5x4xf64>
+}
+
+// CHECK-LABEL: func.func @concat_mix_sparse(
+// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<2x4xf64>,
+// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
+// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[TMP_c2_i32:.*]] = arith.constant 2 : i32
+// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
+// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
+// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:     %[[TMP_c4_i32:.*]] = arith.constant 4 : i32
+// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
+// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[TMP_c5:.*]] = arith.constant 5 : index
+// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
+// CHECK-DAG:     %[[TMP_c1_i8:.*]] = arith.constant 1 : i8
+// CHECK:         %[[TMP_0:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_1:.*]] = memref.cast %[[TMP_0]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_0]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_0]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_2:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_3:.*]] = memref.cast %[[TMP_2]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c5]], %[[TMP_2]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_2]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_4:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_5:.*]] = memref.cast %[[TMP_4]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_4]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_4]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_6:.*]] = llvm.mlir.null : !llvm.ptr<i8>
+// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[TMP_1]], %[[TMP_3]], %[[TMP_5]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c4_i32]], %[[TMP_6]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<f64>
+// CHECK:         %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
+// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
+// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
+// CHECK:             memref.store %[[TMP_arg2]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:             memref.store %[[TMP_arg3]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:             %[[TMP_22:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<2x4xf64>
+// CHECK:             %[[TMP_23:.*]] = arith.cmpf une, %[[TMP_22]], %[[TMP_cst]] : f64
+// CHECK:             scf.if %[[TMP_23]] {
+// CHECK:               memref.store %[[TMP_22]], %[[TMP_8]][] : memref<f64>
+// CHECK:               %[[TMP_24:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_8]], %[[TMP_10]], %[[TMP_5]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+// CHECK:         %[[TMP_11:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_12:.*]] = memref.cast %[[TMP_11]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_11]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_11]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_13:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_14:.*]] = memref.cast %[[TMP_13]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c3]], %[[TMP_13]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_13]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_15:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_16:.*]] = memref.cast %[[TMP_15]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_15]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_15]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_17:.*]] = call @newSparseTensor(%[[TMP_12]], %[[TMP_14]], %[[TMP_16]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_18:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_19:.*]] = memref.cast %[[TMP_18]] : memref<2xindex> to memref<?xindex>
+// CHECK:         %[[TMP_20:.*]] = memref.alloca() : memref<f64>
+// CHECK:         scf.while : () -> () {
+// CHECK:           %[[TMP_22:.*]] = func.call @getNextF64(%[[TMP_17]], %[[TMP_19]], %[[TMP_20]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
+// CHECK:           scf.condition(%[[TMP_22]])
+// CHECK:         } do {
+// CHECK:           %[[TMP_22:.*]] = memref.load %[[TMP_18]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           %[[TMP_23:.*]] = arith.addi %[[TMP_22]], %[[TMP_c2]] : index
+// CHECK:           %[[TMP_24:.*]] = memref.load %[[TMP_18]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           memref.store %[[TMP_23]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           memref.store %[[TMP_24]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_25:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_20]], %[[TMP_10]], %[[TMP_5]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:           scf.yield
+// CHECK:         }
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_17]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         %[[TMP_21:.*]] = call @newSparseTensor(%[[TMP_1]], %[[TMP_3]], %[[TMP_5]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c2_i32]], %[[TMP_7]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         return %[[TMP_21]] : !llvm.ptr<i8>
+// CHECK:       }
+func.func @concat_mix_sparse(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #SparseMatrix>) -> tensor<5x4xf64, #SparseMatrix> {
+  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 0 : index}
+       : tensor<2x4xf64>, tensor<3x4xf64, #SparseMatrix> to tensor<5x4xf64, #SparseMatrix>
+  return %0 : tensor<5x4xf64, #SparseMatrix>
+}
+
+// CHECK-LABEL: func.func @concat_mix_sparse_perm_dim1(
+// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<4x2xf64>,
+// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
+// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[TMP_c2_i32:.*]] = arith.constant 2 : i32
+// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
+// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
+// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:     %[[TMP_c4_i32:.*]] = arith.constant 4 : i32
+// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
+// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
+// CHECK-DAG:     %[[TMP_c5:.*]] = arith.constant 5 : index
+// CHECK-DAG:     %[[TMP_c1_i8:.*]] = arith.constant 1 : i8
+// CHECK:         %[[TMP_0:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_1:.*]] = memref.cast %[[TMP_0]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_0]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_0]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_2:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_3:.*]] = memref.cast %[[TMP_2]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_2]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c5]], %[[TMP_2]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_4:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_5:.*]] = memref.cast %[[TMP_4]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_4]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_4]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_6:.*]] = llvm.mlir.null : !llvm.ptr<i8>
+// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[TMP_1]], %[[TMP_3]], %[[TMP_5]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c4_i32]], %[[TMP_6]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<f64>
+// CHECK:         %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
+// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
+// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
+// CHECK:             memref.store %[[TMP_arg2]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:             memref.store %[[TMP_arg3]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:             %[[TMP_22:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<4x2xf64>
+// CHECK:             %[[TMP_23:.*]] = arith.cmpf une, %[[TMP_22]], %[[TMP_cst]] : f64
+// CHECK:             scf.if %[[TMP_23]] {
+// CHECK:               memref.store %[[TMP_22]], %[[TMP_8]][] : memref<f64>
+// CHECK:               %[[TMP_24:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_8]], %[[TMP_10]], %[[TMP_5]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+// CHECK:         %[[TMP_11:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_12:.*]] = memref.cast %[[TMP_11]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_11]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_11]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_13:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_14:.*]] = memref.cast %[[TMP_13]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_13]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c3]], %[[TMP_13]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_15:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_16:.*]] = memref.cast %[[TMP_15]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_15]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_15]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_17:.*]] = call @newSparseTensor(%[[TMP_12]], %[[TMP_14]], %[[TMP_16]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_18:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_19:.*]] = memref.cast %[[TMP_18]] : memref<2xindex> to memref<?xindex>
+// CHECK:         %[[TMP_20:.*]] = memref.alloca() : memref<f64>
+// CHECK:         scf.while : () -> () {
+// CHECK:           %[[TMP_22:.*]] = func.call @getNextF64(%[[TMP_17]], %[[TMP_19]], %[[TMP_20]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
+// CHECK:           scf.condition(%[[TMP_22]])
+// CHECK:         } do {
+// CHECK:           %[[TMP_22:.*]] = memref.load %[[TMP_18]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           %[[TMP_23:.*]] = memref.load %[[TMP_18]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_24:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+// CHECK:           memref.store %[[TMP_22]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           memref.store %[[TMP_24]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_25:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_20]], %[[TMP_10]], %[[TMP_5]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
+// CHECK:           scf.yield
+// CHECK:         }
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_17]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         %[[TMP_21:.*]] = call @newSparseTensor(%[[TMP_1]], %[[TMP_3]], %[[TMP_5]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c2_i32]], %[[TMP_7]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         return %[[TMP_21]] : !llvm.ptr<i8>
+// CHECK:       }
+func.func @concat_mix_sparse_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #SparseMatrix_P>) -> tensor<4x5xf64, #SparseMatrix_P> {
+  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
+       : tensor<4x2xf64>, tensor<4x3xf64, #SparseMatrix_P> to tensor<4x5xf64, #SparseMatrix_P>
+  return %0 : tensor<4x5xf64, #SparseMatrix_P>
+}
+
+// CHECK-LABEL: func.func @concat_mix_dense_perm_dim1(
+// CHECK-SAME:     %[[TMP_arg0:.*]]: tensor<4x2xf64>,
+// CHECK-SAME:     %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
+// CHECK-DAG:         %[[TMP_c2:.*]] = arith.constant 2 : index
+// CHECK-DAG:         %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
+// CHECK-DAG:         %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
+// CHECK-DAG:         %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:         %[[TMP_c1_i8:.*]] = arith.constant 1 : i8
+// CHECK-DAG:         %[[TMP_c3:.*]] = arith.constant 3 : index
+// CHECK-DAG:         %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK-DAG:         %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:         %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:         %[[TMP_c4:.*]] = arith.constant 4 : index
+// CHECK:         %[[TMP_0:.*]] = memref.alloc() : memref<4x5xf64>
+// CHECK:         linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<4x5xf64>)
+// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
+// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
+// CHECK:             %[[TMP_12:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<4x2xf64>
+// CHECK:             %[[TMP_13:.*]] = arith.cmpf une, %[[TMP_12]], %[[TMP_cst]] : f64
+// CHECK:             scf.if %[[TMP_13]] {
+// CHECK:               memref.store %[[TMP_12]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<4x5xf64>
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+// CHECK:         %[[TMP_1:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:         %[[TMP_2:.*]] = memref.cast %[[TMP_1]] : memref<2xi8> to memref<?xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_1]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:         memref.store %[[TMP_c1_i8]], %[[TMP_1]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:         %[[TMP_3:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_4:.*]] = memref.cast %[[TMP_3]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c4]], %[[TMP_3]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c3]], %[[TMP_3]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_5:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_6:.*]] = memref.cast %[[TMP_5]] : memref<2xindex> to memref<?xindex>
+// CHECK:         memref.store %[[TMP_c0]], %[[TMP_5]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:         memref.store %[[TMP_c1]], %[[TMP_5]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[TMP_2]], %[[TMP_4]], %[[TMP_6]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:         %[[TMP_9:.*]] = memref.cast %[[TMP_8]] : memref<2xindex> to memref<?xindex>
+// CHECK:         %[[TMP_10:.*]] = memref.alloca() : memref<f64>
+// CHECK:         scf.while : () -> () {
+// CHECK:           %[[TMP_12:.*]] = func.call @getNextF64(%[[TMP_7]], %[[TMP_9]], %[[TMP_10]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
+// CHECK:           scf.condition(%[[TMP_12]])
+// CHECK:         } do {
+// CHECK:           %[[TMP_12:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_14:.*]] = arith.addi %[[TMP_13]], %[[TMP_c2]] : index
+// CHECK:           %[[TMP_15:.*]] = memref.load %[[TMP_10]][] : memref<f64>
+// CHECK:           memref.store %[[TMP_15]], %[[TMP_0]][%[[TMP_12]], %[[TMP_14]]] : memref<4x5xf64>
+// CHECK:           scf.yield
+// CHECK:         }
+// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:         %[[TMP_11:.*]] = bufferization.to_tensor %[[TMP_0]] : memref<4x5xf64>
+// CHECK:         return %[[TMP_11]] : tensor<4x5xf64>
+// CHECK:       }
+func.func @concat_mix_dense_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #SparseMatrix_P>) -> tensor<4x5xf64> {
+  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
+       : tensor<4x2xf64>, tensor<4x3xf64, #SparseMatrix_P> to tensor<4x5xf64>
+  return %0 : tensor<4x5xf64>
+}
+
+// CHECK-LABEL: func.func @concat_mix_dense_perm_dim1_dyn(
+// CHECK-SAME:      %[[TMP_arg0:.*]]: tensor<3x2xf64>,
+// CHECK-SAME:      %[[TMP_arg1:.*]]: !llvm.ptr<i8>) 
+// CHECK-DAG:       %[[TMP_c2:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
+// CHECK-DAG:       %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
+// CHECK-DAG:       %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[TMP_c1_i8:.*]] = arith.constant 1 : i8
+// CHECK-DAG:       %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[TMP_c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[TMP_c3:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[TMP_c1:.*]] = arith.constant 1 : index
+// CHECK:           %[[TMP_0:.*]] = memref.alloc() : memref<3x5xf64>
+// CHECK:           %[[TMP_1:.*]] = memref.cast %[[TMP_0]] : memref<3x5xf64> to memref<?x?xf64>
+// CHECK:           linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<3x5xf64>)
+// CHECK:           scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c3]] step %[[TMP_c1]] {
+// CHECK:             scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
+// CHECK:               %[[TMP_13:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<3x2xf64>
+// CHECK:               %[[TMP_14:.*]] = arith.cmpf une, %[[TMP_13]], %[[TMP_cst]] : f64
+// CHECK:               scf.if %[[TMP_14]] {
+// CHECK:                 memref.store %[[TMP_13]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<3x5xf64>
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[TMP_2:.*]] = memref.alloca() : memref<2xi8>
+// CHECK:           %[[TMP_3:.*]] = memref.cast %[[TMP_2]] : memref<2xi8> to memref<?xi8>
+// CHECK:           memref.store %[[TMP_c1_i8]], %[[TMP_2]][%[[TMP_c0]]] : memref<2xi8>
+// CHECK:           memref.store %[[TMP_c1_i8]], %[[TMP_2]][%[[TMP_c1]]] : memref<2xi8>
+// CHECK:           %[[TMP_4:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:           %[[TMP_5:.*]] = memref.cast %[[TMP_4]] : memref<2xindex> to memref<?xindex>
+// CHECK:           memref.store %[[TMP_c3]], %[[TMP_4]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           memref.store %[[TMP_c3]], %[[TMP_4]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_6:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:           %[[TMP_7:.*]] = memref.cast %[[TMP_6]] : memref<2xindex> to memref<?xindex>
+// CHECK:           memref.store %[[TMP_c0]], %[[TMP_6]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:           memref.store %[[TMP_c1]], %[[TMP_6]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:           %[[TMP_8:.*]] = call @newSparseTensor(%[[TMP_3]], %[[TMP_5]], %[[TMP_7]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]]) : (memref<?xi8>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr<i8>) -> !llvm.ptr<i8>
+// CHECK:           %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
+// CHECK:           %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
+// CHECK:           %[[TMP_11:.*]] = memref.alloca() : memref<f64>
+// CHECK:           scf.while : () -> () {
+// CHECK:             %[[TMP_13:.*]] = func.call @getNextF64(%[[TMP_8]], %[[TMP_10]], %[[TMP_11]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
+// CHECK:             scf.condition(%[[TMP_13]])
+// CHECK:           } do {
+// CHECK:             %[[TMP_13:.*]] = memref.load %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
+// CHECK:             %[[TMP_14:.*]] = memref.load %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
+// CHECK:             %[[TMP_15:.*]] = arith.addi %[[TMP_14]], %[[TMP_c2]] : index
+// CHECK:             %[[TMP_16:.*]] = memref.load %[[TMP_11]][] : memref<f64>
+// CHECK:             memref.store %[[TMP_16]], %[[TMP_0]][%[[TMP_13]], %[[TMP_15]]] : memref<3x5xf64>
+// CHECK:             scf.yield
+// CHECK:           }
+// CHECK:           call @delSparseTensorCOOF64(%[[TMP_8]]) : (!llvm.ptr<i8>) -> ()
+// CHECK:           %[[TMP_12:.*]] = bufferization.to_tensor %[[TMP_1]] : memref<?x?xf64>
+// CHECK:           return %[[TMP_12]] : tensor<?x?xf64>
+// CHECK:         }
+// CHECK:       }
+func.func @concat_mix_dense_perm_dim1_dyn(%arg0: tensor<3x2xf64>, %arg1: tensor<3x3xf64, #SparseMatrix>) -> tensor<?x?xf64> {
+  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
+       : tensor<3x2xf64>, tensor<3x3xf64, #SparseMatrix> to tensor<?x?xf64>
+  return %0 : tensor<?x?xf64>
+}

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate.mlir
new file mode 100644
index 0000000000000..37f6f749d4dff
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate.mlir
@@ -0,0 +1,430 @@
+// RUN: mlir-opt %s --sparse-compiler | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+#MAT_C_C = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}>
+#MAT_D_C = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}>
+#MAT_C_D = #sparse_tensor.encoding<{dimLevelType = ["compressed", "dense"]}>
+
+#MAT_C_C_P = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+#MAT_C_D_P = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "dense" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+#MAT_D_C_P = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+module {
+  //
+  // Tests without permutation.
+  //
+
+  // Concats all sparse matrices (with different encodings) to a sparse matrix.
+  func.func @concat_sparse_sparse(%arg0: tensor<2x4xf64, #MAT_C_C>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #MAT_C_C>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64, #MAT_C_C>
+    return %0 : tensor<9x4xf64, #MAT_C_C>
+  }
+
+  // Concats all sparse matrices (with different encodings) to a dense matrix.
+  func.func @concat_sparse_dense(%arg0: tensor<2x4xf64, #MAT_C_C>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #MAT_C_C>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64>
+    return %0 : tensor<9x4xf64>
+  }
+
+  // Concats a mix of sparse and dense matrices to a sparse matrix.
+  func.func @concat_mix_sparse(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64, #MAT_C_C>
+    return %0 : tensor<9x4xf64, #MAT_C_C>
+  }
+
+  // Concats a mix of sparse and dense matrices to a dense matrix.
+  func.func @concat_mix_dense(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64>
+    return %0 : tensor<9x4xf64>
+  }
+
+  //
+  // Tests with permutation.
+  //
+
+  // Concats all sparse matrices (with different encodings) to a sparse matrix.
+  func.func @concat_sparse_sparse_perm(%arg0: tensor<2x4xf64, #MAT_C_C_P>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C_P> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64, #MAT_C_C_P>
+    return %0 : tensor<9x4xf64, #MAT_C_C_P>
+  }
+
+  // Concats all sparse matrices (with different encodings) to a dense matrix.
+  func.func @concat_sparse_dense_perm(%arg0: tensor<2x4xf64, #MAT_C_C_P>, %arg1: tensor<3x4xf64, #MAT_C_D_P>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64>
+    return %0 : tensor<9x4xf64>
+  }
+
+  // Concats a mix of sparse and dense matrices to a sparse matrix.
+  func.func @concat_mix_sparse_perm(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #MAT_C_D_P>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C> to tensor<9x4xf64, #MAT_C_C>
+    return %0 : tensor<9x4xf64, #MAT_C_C>
+  }
+
+  // Concats a mix of sparse and dense matrices to a dense matrix.
+  func.func @concat_mix_dense_perm(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C_P>) -> tensor<9x4xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C_P> to tensor<9x4xf64>
+    return %0 : tensor<9x4xf64>
+  }
+
+  //
+  // Tests without permutation (concatenate on dimension 1)
+  //
+
+  // Concats all sparse matrices (with different encodings) to a sparse matrix.
+  func.func @concat_sparse_sparse_dim1(%arg0: tensor<4x2xf64, #MAT_C_C>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64, #MAT_C_C>
+    return %0 : tensor<4x9xf64, #MAT_C_C>
+  }
+
+  // Concats all sparse matrices (with different encodings) to a dense matrix.
+  func.func @concat_sparse_dense_dim1(%arg0: tensor<4x2xf64, #MAT_C_C>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64>
+    return %0 : tensor<4x9xf64>
+  }
+
+  // Concats a mix of sparse and dense matrices to a sparse matrix.
+  func.func @concat_mix_sparse_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64, #MAT_C_C>
+    return %0 : tensor<4x9xf64, #MAT_C_C>
+  }
+
+  // Concats a mix of sparse and dense matrices to a dense matrix.
+  func.func @concat_mix_dense_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64>
+    return %0 : tensor<4x9xf64>
+  }
+
+  //
+  // Tests with permutation (concatenate on dimension 1)
+  //
+
+  // Concats all sparse matrices (with different encodings) to a sparse matrix.
+  func.func @concat_sparse_sparse_perm_dim1(%arg0: tensor<4x2xf64, #MAT_C_C_P>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C_P> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64, #MAT_C_C_P>
+    return %0 : tensor<4x9xf64, #MAT_C_C_P>
+  }
+
+  // Concats all sparse matrices (with different encodings) to a dense matrix.
+  func.func @concat_sparse_dense_perm_dim1(%arg0: tensor<4x2xf64, #MAT_C_C_P>, %arg1: tensor<4x3xf64, #MAT_C_D_P>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64>
+    return %0 : tensor<4x9xf64>
+  }
+
+  // Concats mixed sparse and dense matrices to a sparse matrix.
+  func.func @concat_mix_sparse_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #MAT_C_D_P>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C> to tensor<4x9xf64, #MAT_C_C>
+    return %0 : tensor<4x9xf64, #MAT_C_C>
+  }
+
+  // Concats mixed sparse and dense matrices to a dense matrix.
+  func.func @concat_mix_dense_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C_P>) -> tensor<4x9xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C_P> to tensor<4x9xf64>
+    return %0 : tensor<4x9xf64>
+  }
+
+  //
+  // Concats mixed sparse and dense matrices to a sparse matrix (with dynamic sizes).
+  //
+  func.func @concat_mix_sparse_dyn(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #MAT_C_D>, %arg2: tensor<4x4xf64, #MAT_D_C>) -> tensor<?x?xf64, #MAT_C_C> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 1 : index}
+         : tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C> to tensor<?x?xf64, #MAT_C_C>
+    return %0 : tensor<?x?xf64, #MAT_C_C>
+  }
+
+  func.func @dump_mat_9x4(%A: tensor<9x4xf64, #MAT_C_C>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %c = sparse_tensor.convert %A : tensor<9x4xf64, #MAT_C_C> to tensor<9x4xf64>
+    %m = bufferization.to_memref %c : memref<9x4xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<9x4xf64>, vector<9x4xf64>
+    vector.print %v : vector<9x4xf64>
+
+    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_C_C> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<36xf64>
+    vector.print %2 : vector<36xf64>
+
+    return
+  }
+
+  func.func @dump_mat_perm_9x4(%A: tensor<9x4xf64, #MAT_C_C_P>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %c = sparse_tensor.convert %A : tensor<9x4xf64, #MAT_C_C_P> to tensor<9x4xf64>
+    %m = bufferization.to_memref %c : memref<9x4xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<9x4xf64>, vector<9x4xf64>
+    vector.print %v : vector<9x4xf64>
+
+    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_C_C_P> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<36xf64>
+    vector.print %2 : vector<36xf64>
+
+    return
+  }
+
+  func.func @dump_mat_dense_9x4(%A: tensor<9x4xf64>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %m = bufferization.to_memref %A : memref<9x4xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<9x4xf64>, vector<9x4xf64>
+    vector.print %v : vector<9x4xf64>
+
+    return
+  }
+
+  func.func @dump_mat_4x9(%A: tensor<4x9xf64, #MAT_C_C>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C> to tensor<4x9xf64>
+    %m = bufferization.to_memref %c : memref<4x9xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<4x9xf64>, vector<4x9xf64>
+    vector.print %v : vector<4x9xf64>
+
+    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<36xf64>
+    vector.print %2 : vector<36xf64>
+
+    return
+  }
+
+  func.func @dump_mat_dyn(%A: tensor<?x?xf64, #MAT_C_C>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %c = sparse_tensor.convert %A : tensor<?x?xf64, #MAT_C_C> to tensor<?x?xf64>
+    %m = bufferization.to_memref %c : memref<?x?xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<?x?xf64>, vector<4x9xf64>
+    vector.print %v : vector<4x9xf64>
+
+    %1 = sparse_tensor.values %A : tensor<?x?xf64, #MAT_C_C> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<36xf64>
+    vector.print %2 : vector<36xf64>
+
+    return
+  }
+
+  func.func @dump_mat_perm_4x9(%A: tensor<4x9xf64, #MAT_C_C_P>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C_P> to tensor<4x9xf64>
+    %m = bufferization.to_memref %c : memref<4x9xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<4x9xf64>, vector<4x9xf64>
+    vector.print %v : vector<4x9xf64>
+
+    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C_P> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<36xf64>
+    vector.print %2 : vector<36xf64>
+
+    return
+  }
+
+  func.func @dump_mat_dense_4x9(%A: tensor<4x9xf64>) {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %m = bufferization.to_memref %A : memref<4x9xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<4x9xf64>, vector<4x9xf64>
+    vector.print %v : vector<4x9xf64>
+
+    return
+  }
+
+  // Driver method to call and verify kernels.
+  func.func @entry() {
+    %m42 = arith.constant dense<
+      [ [ 1.0, 0.0 ],
+        [ 3.1, 0.0 ],
+        [ 0.0, 2.0 ],
+        [ 0.0, 0.0 ] ]> : tensor<4x2xf64>
+    %m43 = arith.constant dense<
+      [ [ 1.0, 0.0, 1.0 ],
+        [ 1.0, 0.0, 0.5 ],
+        [ 0.0, 0.0, 1.0 ],
+        [ 5.0, 2.0, 0.0 ] ]> : tensor<4x3xf64>
+    %m24 = arith.constant dense<
+      [ [ 1.0, 0.0, 3.0, 0.0],
+        [ 0.0, 2.0, 0.0, 0.0] ]> : tensor<2x4xf64>
+    %m34 = arith.constant dense<
+      [ [ 1.0, 0.0, 1.0, 1.0],
+        [ 0.0, 0.5, 0.0, 0.0],
+        [ 1.0, 5.0, 2.0, 0.0] ]> : tensor<3x4xf64>
+    %m44 = arith.constant dense<
+      [ [ 0.0, 0.0, 1.5, 1.0],
+        [ 0.0, 3.5, 0.0, 0.0],
+        [ 1.0, 5.0, 2.0, 0.0],
+        [ 1.0, 0.5, 0.0, 0.0] ]> : tensor<4x4xf64>
+
+    %sm24cc = sparse_tensor.convert %m24 : tensor<2x4xf64> to tensor<2x4xf64, #MAT_C_C>
+    %sm34cd = sparse_tensor.convert %m34 : tensor<3x4xf64> to tensor<3x4xf64, #MAT_C_D>
+    %sm42cc = sparse_tensor.convert %m42 : tensor<4x2xf64> to tensor<4x2xf64, #MAT_C_C>
+    %sm43cd = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D>
+    %sm44dc = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C>
+
+    %sm24ccp = sparse_tensor.convert %m24 : tensor<2x4xf64> to tensor<2x4xf64, #MAT_C_C_P>
+    %sm34cdp = sparse_tensor.convert %m34 : tensor<3x4xf64> to tensor<3x4xf64, #MAT_C_D_P>
+    %sm42ccp = sparse_tensor.convert %m42 : tensor<4x2xf64> to tensor<4x2xf64, #MAT_C_C_P>
+    %sm43cdp = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D_P>
+    %sm44dcp = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C_P>
+
+    %sm43cd_dyn = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<?x?xf64, #MAT_C_D>
+    %sm44dc_dyn = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<?x?xf64, #MAT_D_C>
+
+    // CHECK:    ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 3, 2, 1, 0, 1, 1, 0, 0.5, 0, 0, 1, 5, 2, 0, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %0 = call @concat_sparse_sparse(%sm24cc, %sm34cd, %sm44dc)
+               : (tensor<2x4xf64, #MAT_C_C>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C>
+    call @dump_mat_9x4(%0) : (tensor<9x4xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    %1 = call @concat_sparse_dense(%sm24cc, %sm34cd, %sm44dc)
+               : (tensor<2x4xf64, #MAT_C_C>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64>
+    call @dump_mat_dense_9x4(%1) : (tensor<9x4xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 3, 2, 1, 0, 1, 1, 0, 0.5, 0, 0, 1, 5, 2, 0, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %2 = call @concat_mix_sparse(%m24, %sm34cd, %sm44dc)
+               : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C>
+    call @dump_mat_9x4(%2) : (tensor<9x4xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    %3 = call @concat_mix_dense(%m24, %sm34cd, %sm44dc)
+               : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64>
+    call @dump_mat_dense_9x4(%3) : (tensor<9x4xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 1, 0, 1, 1, 1, 2, 0, 0.5, 5, 3.5, 5, 0.5, 3, 1, 0, 2, 1.5, 2, 1, 0, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %4 = call @concat_sparse_sparse_perm(%sm24ccp, %sm34cd, %sm44dc)
+               : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C_P>
+    call @dump_mat_perm_9x4(%4) : (tensor<9x4xf64, #MAT_C_C_P>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    %5 = call @concat_sparse_dense_perm(%sm24ccp, %sm34cdp, %sm44dc)
+               : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64>
+    call @dump_mat_dense_9x4(%5) : (tensor<9x4xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 3, 2, 1, 0, 1, 1, 0, 0.5, 0, 0, 1, 5, 2, 0, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %6 = call @concat_mix_sparse_perm(%m24, %sm34cdp, %sm44dc)
+               : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C>
+    call @dump_mat_9x4(%6) : (tensor<9x4xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 3, 0 ), ( 0, 2, 0, 0 ), ( 1, 0, 1, 1 ), ( 0, 0.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 0, 0, 1.5, 1 ), ( 0, 3.5, 0, 0 ), ( 1, 5, 2, 0 ), ( 1, 0.5, 0, 0 ) )
+    %7 = call @concat_mix_dense_perm(%m24, %sm34cd, %sm44dcp)
+               : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C_P>) -> tensor<9x4xf64>
+    call @dump_mat_dense_9x4(%7) : (tensor<9x4xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 1, 0, 1, 1.5, 1, 3.1, 1, 0, 0.5, 3.5, 2, 0, 0, 1, 1, 5, 2, 5, 2, 0, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %8 = call @concat_sparse_sparse_dim1(%sm42cc, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
+    call @dump_mat_4x9(%8) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    %9 = call @concat_sparse_dense_dim1(%sm42cc, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
+    call @dump_mat_dense_4x9(%9) : (tensor<4x9xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 1, 0, 1, 1.5, 1, 3.1, 1, 0, 0.5, 3.5, 2, 0, 0, 1, 1, 5, 2, 5, 2, 0, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %10 = call @concat_mix_sparse_dim1(%m42, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
+    call @dump_mat_4x9(%10) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    %11 = call @concat_mix_dense_dim1(%m42, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
+    call @dump_mat_dense_4x9(%11) : (tensor<4x9xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 3.1, 2, 1, 1, 0, 5, 0, 0, 0, 2, 1, 0.5, 1, 0, 1, 1, 3.5, 5, 0.5, 1.5, 2, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %12 = call @concat_sparse_sparse_perm_dim1(%sm42ccp, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C_P>
+    call @dump_mat_perm_4x9(%12) : (tensor<4x9xf64, #MAT_C_C_P>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    %13 = call @concat_sparse_dense_perm_dim1(%sm42ccp, %sm43cdp, %sm44dc)
+               : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
+    call @dump_mat_dense_4x9(%13) : (tensor<4x9xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 1, 0, 1, 1.5, 1, 3.1, 1, 0, 0.5, 3.5, 2, 0, 0, 1, 1, 5, 2, 5, 2, 0, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %14 = call @concat_mix_sparse_perm_dim1(%m42, %sm43cdp, %sm44dc)
+               : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
+    call @dump_mat_4x9(%14) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    %15 = call @concat_mix_dense_perm_dim1(%m42, %sm43cd, %sm44dcp)
+               : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C_P>) -> tensor<4x9xf64>
+    call @dump_mat_dense_4x9(%15) : (tensor<4x9xf64>) -> ()
+
+    // CHECK-NEXT: ( ( 1, 0, 1, 0, 1, 0, 0, 1.5, 1 ), ( 3.1, 0, 1, 0, 0.5, 0, 3.5, 0, 0 ), ( 0, 2, 0, 0, 1, 1, 5, 2, 0 ), ( 0, 0, 5, 2, 0, 1, 0.5, 0, 0 ) )
+    // CHECK-NEXT: ( 1, 1, 0, 1, 1.5, 1, 3.1, 1, 0, 0.5, 3.5, 2, 0, 0, 1, 1, 5, 2, 5, 2, 0, 1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 )
+    %16 = call @concat_mix_sparse_dyn(%m42, %sm43cd, %sm44dc)
+               : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<?x?xf64, #MAT_C_C>
+    call @dump_mat_dyn(%16) : (tensor<?x?xf64, #MAT_C_C>) -> ()
+
+    // Release resources.
+    bufferization.dealloc_tensor %sm24cc  : tensor<2x4xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %sm34cd  : tensor<3x4xf64, #MAT_C_D>
+    bufferization.dealloc_tensor %sm42cc  : tensor<4x2xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %sm43cd  : tensor<4x3xf64, #MAT_C_D>
+    bufferization.dealloc_tensor %sm44dc  : tensor<4x4xf64, #MAT_D_C>
+    bufferization.dealloc_tensor %sm24ccp : tensor<2x4xf64, #MAT_C_C_P>
+    bufferization.dealloc_tensor %sm34cdp : tensor<3x4xf64, #MAT_C_D_P>
+    bufferization.dealloc_tensor %sm42ccp : tensor<4x2xf64, #MAT_C_C_P>
+    bufferization.dealloc_tensor %sm43cdp : tensor<4x3xf64, #MAT_C_D_P>
+    bufferization.dealloc_tensor %sm44dcp : tensor<4x4xf64, #MAT_D_C_P>
+    bufferization.dealloc_tensor %0  : tensor<9x4xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %1  : tensor<9x4xf64>
+    bufferization.dealloc_tensor %2  : tensor<9x4xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %3  : tensor<9x4xf64>
+    bufferization.dealloc_tensor %4  : tensor<9x4xf64, #MAT_C_C_P>
+    bufferization.dealloc_tensor %5  : tensor<9x4xf64>
+    bufferization.dealloc_tensor %6  : tensor<9x4xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %7  : tensor<9x4xf64>
+    bufferization.dealloc_tensor %8  : tensor<4x9xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %9  : tensor<4x9xf64>
+    bufferization.dealloc_tensor %10 : tensor<4x9xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %11 : tensor<4x9xf64>
+    bufferization.dealloc_tensor %12 : tensor<4x9xf64, #MAT_C_C_P>
+    bufferization.dealloc_tensor %13 : tensor<4x9xf64>
+    bufferization.dealloc_tensor %14 : tensor<4x9xf64, #MAT_C_C>
+    bufferization.dealloc_tensor %15 : tensor<4x9xf64>
+    bufferization.dealloc_tensor %16 : tensor<?x?xf64, #MAT_C_C>
+    return
+  }
+}
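
For readers skimming the test above, a reduced form of the new op is sketched below: two sparse matrices concatenated along dimension 0, where the static result size along that dimension is the sum of the input sizes (2 + 3 = 5), mirroring the 9x4 and 4x9 cases in the test. The #CSR alias and the function name are illustrative only and are not part of this patch.

    #CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>

    // Concatenate two sparse 2-D tensors along dimension 0 (2 + 3 rows -> 5 rows).
    func.func @concat_two_csr(%a: tensor<2x4xf64, #CSR>,
                              %b: tensor<3x4xf64, #CSR>) -> tensor<5x4xf64, #CSR> {
      %0 = sparse_tensor.concatenate %a, %b {dimension = 0 : index}
           : tensor<2x4xf64, #CSR>, tensor<3x4xf64, #CSR> to tensor<5x4xf64, #CSR>
      return %0 : tensor<5x4xf64, #CSR>
    }

The same conversion path also accepts dense inputs and produces dense outputs, as the mixed test cases above demonstrate.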
