[Mlir-commits] [mlir] a0c5b7e - [mlir][sparse] support for very narrow index and pointer types

Aart Bik llvmlistbot at llvm.org
Thu Apr 1 18:21:43 PDT 2021


Author: Aart Bik
Date: 2021-04-01T18:21:27-07:00
New Revision: a0c5b7e3b523764d089e2d843648acffcd496b79

URL: https://github.com/llvm/llvm-project/commit/a0c5b7e3b523764d089e2d843648acffcd496b79
DIFF: https://github.com/llvm/llvm-project/commit/a0c5b7e3b523764d089e2d843648acffcd496b79.diff

LOG: [mlir][sparse] support for very narrow index and pointer types

Rationale:
Small pointer and index types, when allowed by the required range of the
input tensors, can reduce the memory footprint of sparse tensors even more.
Note, however, that we must be careful to zero extend these values when
loading them (since sparse tensors never use negative indices), whereas
LLVM treats the index type as signed in most memory operations (such as
gather and scatter). This CL dots all the i's in this regard.
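
The signed/unsigned distinction matters as soon as a stored index has its
high bit set. A minimal, self-contained C++ illustration (not part of the
patch) of why narrow stored indices must be zero extended rather than sign
extended before any address computation:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // A stored 8-bit column index with the high bit set, e.g. column 200.
    uint8_t stored = 200;
    // Sign extension (what a signed index type would imply) misreads it.
    int64_t asSigned = static_cast<int8_t>(stored); // -56
    // Zero extension preserves the intended unsigned value.
    int64_t asUnsigned = stored;                    // 200
    std::printf("signed=%lld unsigned=%lld\n",
                static_cast<long long>(asSigned),
                static_cast<long long>(asUnsigned));
    return 0;
  }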

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D99777

Added: 
    mlir/test/Integration/Sparse/sparse_matvec.mlir
    mlir/test/Integration/data/wide.mtx

Modified: 
    mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
    mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
    mlir/lib/ExecutionEngine/SparseUtils.cpp
    mlir/test/CMakeLists.txt
    mlir/test/Dialect/Linalg/sparse_storage.mlir
    mlir/test/Dialect/Linalg/sparse_vector.mlir
    mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir
    mlir/test/Integration/Sparse/CPU/sparse_sum.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
index f4c84fc4366b0..ef8f1310d2acd 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
@@ -76,6 +76,10 @@ class TensorToPointersConverter
       name = "sparsePointers64";
     else if (eltType.isInteger(32))
       name = "sparsePointers32";
+    else if (eltType.isInteger(16))
+      name = "sparsePointers16";
+    else if (eltType.isInteger(8))
+      name = "sparsePointers8";
     else
       return failure();
     rewriter.replaceOpWithNewOp<CallOp>(
@@ -100,6 +104,10 @@ class TensorToIndicesConverter
       name = "sparseIndices64";
     else if (eltType.isInteger(32))
       name = "sparseIndices32";
+    else if (eltType.isInteger(16))
+      name = "sparseIndices16";
+    else if (eltType.isInteger(8))
+      name = "sparseIndices8";
     else
       return failure();
     rewriter.replaceOpWithNewOp<CallOp>(

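The converters above simply key the runtime entry point off the integer
width of the overhead storage type. A standalone C++ sketch of that mapping
(a simplified stand-in for the rewrite pattern, for illustration only; the
function names match the entry points added to SparseUtils.cpp below):

  #include <cassert>
  #include <string>

  // Simplified width-to-name mapping; the real pattern switches on the MLIR
  // element type and replaces the op with a CallOp to the selected function.
  static std::string sparsePointersName(unsigned width) {
    switch (width) {
    case 64: return "sparsePointers64";
    case 32: return "sparsePointers32";
    case 16: return "sparsePointers16";
    case 8:  return "sparsePointers8";
    default: return ""; // unsupported width: the conversion fails
    }
  }

  int main() {
    assert(sparsePointersName(8) == "sparsePointers8");
    return 0;
  }
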
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
index c0c1970290fca..9ed3282b02104 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -614,18 +614,21 @@ static void genBuffers(Merger &merger, CodeGen &codegen,
   }
 }
 
+/// Constructs vector type.
+static VectorType vectorType(CodeGen &codegen, Type etp) {
+  return VectorType::get(codegen.curVecLength, etp);
+}
+
 /// Constructs vector type from pointer.
 static VectorType vectorType(CodeGen &codegen, Value ptr) {
-  Type etp = ptr.getType().cast<MemRefType>().getElementType();
-  return VectorType::get(codegen.curVecLength, etp);
+  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
 }
 
 /// Constructs vector iteration mask.
 static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,
                            Value iv, Value lo, Value hi, Value step) {
   Location loc = iv.getLoc();
-  VectorType mtp =
-      VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1));
+  VectorType mtp = vectorType(codegen, rewriter.getIntegerType(1));
   // Special case if the vector length evenly divides the trip count (for
   // example, "for i = 0, 128, 16"). A constant all-true mask is generated
   // so that all subsequent masked memory operations are immediately folded
@@ -683,7 +686,7 @@ static void genVectorStore(CodeGen &codegen, PatternRewriter &rewriter,
 /// optimizations to hoist the invariant broadcast out of the vector loop.
 static Value genVectorInvariantValue(CodeGen &codegen,
                                      PatternRewriter &rewriter, Value val) {
-  VectorType vtp = VectorType::get(codegen.curVecLength, val.getType());
+  VectorType vtp = vectorType(codegen, val.getType());
   return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
 }
 
@@ -747,15 +750,47 @@ static void genTensorStore(Merger &merger, CodeGen &codegen,
     rewriter.create<memref::StoreOp>(loc, rhs, ptr, args);
 }
 
-/// Generates a pointer/index load from the sparse storage scheme.
+/// Generates a pointer/index load from the sparse storage scheme. Narrower
+/// data types need to be zero extended before casting the value into the
+/// index type used for looping and indexing.
 static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc,
                      Value ptr, Value s) {
-  if (codegen.curVecLength > 1)
-    return genVectorLoad(codegen, rewriter, ptr, {s});
+  // See https://llvm.org/docs/GetElementPtr.html for some background on
+  // the complications described below.
+  if (codegen.curVecLength > 1) {
+    // Since the index vector is used in subsequent gather/scatter operations,
+    // which effectively define an unsigned pointer + signed index, we must
+    // zero extend the vector to an index width. For 8-bit and 16-bit values,
+    // a 32-bit index width suffices. For 32-bit values, zero extending the
+    // elements into 64-bit loses some performance, since the 32-bit indexed
+    // gather/scatter is more efficient than the 64-bit index variant (in
+    // the future, we could introduce a flag that states the negative space
+    // of 32-bit indices is unused). For 64-bit values, there is no good way
+    // to state that the indices are unsigned, which creates the potential for
+    // incorrect address calculations in the unlikely case we need such
+    // extremely large offsets.
+    Type etp = ptr.getType().cast<MemRefType>().getElementType();
+    Value vload = genVectorLoad(codegen, rewriter, ptr, {s});
+    if (etp.getIntOrFloatBitWidth() < 32)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(32)));
+    else if (etp.getIntOrFloatBitWidth() < 64)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(64)));
+    return vload;
+  }
+  // For the scalar case, we simply zero extend narrower indices into 64-bit
+  // values before casting to index without a performance penalty. Here too,
+  // however, indices that already are 64-bit, in theory, cannot express the
+  // full range as explained above.
   Value load = rewriter.create<memref::LoadOp>(loc, ptr, s);
-  return load.getType().isa<IndexType>()
-             ? load
-             : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  if (!load.getType().isa<IndexType>()) {
+    if (load.getType().getIntOrFloatBitWidth() < 64)
+      load = rewriter.create<ZeroExtendIOp>(loc, load,
+                                            rewriter.getIntegerType(64));
+    load = rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  }
+  return load;
 }
 
 /// Generates an invariant value.
@@ -959,8 +994,10 @@ static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
     if (!merger.isSparseTensor(t) && !linkedSparse(op, t)) {
       auto map = op.getIndexingMap(t);
       unsigned r = map.getNumResults();
-      if (r && map.getDimPosition(r - 1) != idx)
-        return false;
+      for (unsigned i = 0; i < r; i++) {
+        if (map.getDimPosition(i) == idx && i != r - 1)
+          return false;
+      }
     }
   }
   return true;

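The zero-extension policy described in the genLoad comments above boils down
to a small width-selection rule. The following standalone C++ sketch (a
hypothetical helper, not part of the patch) captures that rule:

  #include <cassert>

  // Given the bit width of a stored pointer/index element, return the
  // integer width it is zero extended to before being used for addressing.
  // The real code builds ZeroExtendIOp/IndexCastOp on MLIR values instead.
  static unsigned extendedWidth(unsigned storedWidth, bool vectorized) {
    if (vectorized) {
      // Gather/scatter indices: 8/16-bit widen to 32-bit, 32-bit widens to
      // 64-bit, and 64-bit is used as is (its sign bit cannot be "fixed").
      return storedWidth < 32 ? 32 : 64;
    }
    // Scalar loads: anything below 64-bit is widened to 64-bit first, then
    // an index_cast yields the index value (a 64-bit load skips the zext).
    return 64;
  }

  int main() {
    assert(extendedWidth(8, /*vectorized=*/true) == 32);
    assert(extendedWidth(16, /*vectorized=*/true) == 32);
    assert(extendedWidth(32, /*vectorized=*/true) == 64);
    assert(extendedWidth(8, /*vectorized=*/false) == 64);
    return 0;
  }
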
diff  --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
index 903b9f1151820..5b4af4ca85ddf 100644
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -113,12 +113,21 @@ struct SparseTensor {
 class SparseTensorStorageBase {
 public:
   virtual uint64_t getDimSize(uint64_t) = 0;
+
+  // Overhead storage.
   virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
   virtual void getPointers(std::vector<uint32_t> **, uint64_t) { fatal("p32"); }
+  virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
+  virtual void getPointers(std::vector<uint8_t> **, uint64_t) { fatal("p8"); }
   virtual void getIndices(std::vector<uint64_t> **, uint64_t) { fatal("i64"); }
   virtual void getIndices(std::vector<uint32_t> **, uint64_t) { fatal("i32"); }
+  virtual void getIndices(std::vector<uint16_t> **, uint64_t) { fatal("i16"); }
+  virtual void getIndices(std::vector<uint8_t> **, uint64_t) { fatal("i8"); }
+
+  // Primary storage.
   virtual void getValues(std::vector<double> **) { fatal("valf64"); }
   virtual void getValues(std::vector<float> **) { fatal("valf32"); }
+
   virtual ~SparseTensorStorageBase() {}
 
 private:
@@ -464,6 +473,22 @@ struct MemRef1DU32 {
   uint64_t strides[1];
 };
 
+struct MemRef1DU16 {
+  const uint16_t *base;
+  const uint16_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
+struct MemRef1DU8 {
+  const uint8_t *base;
+  const uint8_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
 struct MemRef1DF64 {
   const double *base;
   const double *data;
@@ -480,41 +505,42 @@ struct MemRef1DF32 {
   uint64_t strides[1];
 };
 
-enum TypeEnum : uint64_t { kF64 = 0, kF32 = 1, kU64 = 2, kU32 = 3 };
+enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
+enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };
+
+#define CASE(p, i, v, P, I, V)                                                 \
+  if (ptrTp == (p) && indTp == (i) && valTp == (v))                            \
+  return newSparseTensor<P, I, V>(filename, sparsity, asize)
 
 void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
                       uint64_t asize, uint64_t astride, uint64_t ptrTp,
                       uint64_t indTp, uint64_t valTp) {
   assert(astride == 1);
   bool *sparsity = abase + aoff;
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint32_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint32_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
+
+  // The most common cases: 64-bit or 32-bit overhead, double/float values.
+  CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
+  CASE(kU64, kU64, kF32, uint64_t, uint64_t, float);
+  CASE(kU64, kU32, kF64, uint64_t, uint32_t, double);
+  CASE(kU64, kU32, kF32, uint64_t, uint32_t, float);
+  CASE(kU32, kU64, kF64, uint32_t, uint64_t, double);
+  CASE(kU32, kU64, kF32, uint32_t, uint64_t, float);
+  CASE(kU32, kU32, kF64, uint32_t, uint32_t, double);
+  CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
+
+  // Some special cases: low overhead storage, double/float values.
+  CASE(kU16, kU16, kF64, uint16_t, uint16_t, double);
+  CASE(kU8, kU8, kF64, uint8_t, uint8_t, double);
+  CASE(kU16, kU16, kF32, uint16_t, uint16_t, float);
+  CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
+
+  // Unsupported case (add above if needed).
   fputs("unsupported combination of types\n", stderr);
   exit(1);
 }
 
+#undef CASE
+
 uint64_t sparseDimSize(void *tensor, uint64_t d) {
   return static_cast<SparseTensorStorageBase *>(tensor)->getDimSize(d);
 }
@@ -531,6 +557,18 @@ MemRef1DU32 sparsePointers32(void *tensor, uint64_t d) {
   return {v->data(), v->data(), 0, {v->size()}, {1}};
 }
 
+MemRef1DU16 sparsePointers16(void *tensor, uint64_t d) {
+  std::vector<uint16_t> *v;
+  static_cast<SparseTensorStorageBase *>(tensor)->getPointers(&v, d);
+  return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
+MemRef1DU8 sparsePointers8(void *tensor, uint64_t d) {
+  std::vector<uint8_t> *v;
+  static_cast<SparseTensorStorageBase *>(tensor)->getPointers(&v, d);
+  return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
 MemRef1DU64 sparseIndices64(void *tensor, uint64_t d) {
   std::vector<uint64_t> *v;
   static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
@@ -543,6 +581,18 @@ MemRef1DU32 sparseIndices32(void *tensor, uint64_t d) {
   return {v->data(), v->data(), 0, {v->size()}, {1}};
 }
 
+MemRef1DU16 sparseIndices16(void *tensor, uint64_t d) {
+  std::vector<uint16_t> *v;
+  static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
+  return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
+MemRef1DU8 sparseIndices8(void *tensor, uint64_t d) {
+  std::vector<uint8_t> *v;
+  static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
+  return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
 MemRef1DF64 sparseValuesF64(void *tensor) {
   std::vector<double> *v;
   static_cast<SparseTensorStorageBase *>(tensor)->getValues(&v);

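Note that the single TypeEnum is now split into two enums with new values
(kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 for overhead storage; kF64 = 1,
kF32 = 2 for primary storage), which is why the index constants in the
integration tests below change as well. The CASE macro keeps the growing
dispatch table readable; a self-contained C++ sketch (a simplified stand-in,
not the actual runtime) of how a (ptrTp, indTp, valTp) triple selects a
template instantiation:

  #include <cstdint>
  #include <cstdio>

  // Mirrors the new runtime enums; the numeric values are what the MLIR
  // tests pass in as index constants.
  enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
  enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };

  // Stand-in for the templated newSparseTensor: just report which
  // instantiation the CASE macro chain would select.
  template <typename P, typename I, typename V>
  void report(const char *msg) { std::puts(msg); }

  #define CASE(p, i, v, P, I, V)                                             \
    if (ptrTp == (p) && indTp == (i) && valTp == (v))                        \
    return report<P, I, V>(#P "/" #I "/" #V)

  void dispatch(uint64_t ptrTp, uint64_t indTp, uint64_t valTp) {
    CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
    CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
    std::puts("unsupported combination of types");
  }

  int main() {
    // sparse_matvec.mlir passes ptrTp = indTp = 4 (kU8) and valTp = 2 (kF32),
    // so the uint8_t/uint8_t/float storage scheme is instantiated.
    dispatch(4, 4, 2); // prints: uint8_t/uint8_t/float
    return 0;
  }
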
diff  --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 775a462db53d9..7343c3b076ba6 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -36,6 +36,7 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS)
   # Copy test data over.
   file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.mtx
             ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.tns
+            ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/wide.mtx
           DESTINATION ${MLIR_INTEGRATION_TEST_DIR}/data/)
 endif()
 

diff  --git a/mlir/test/Dialect/Linalg/sparse_storage.mlir b/mlir/test/Dialect/Linalg/sparse_storage.mlir
index 81127513ea353..998b71f5a24d2 100644
--- a/mlir/test/Dialect/Linalg/sparse_storage.mlir
+++ b/mlir/test/Dialect/Linalg/sparse_storage.mlir
@@ -51,7 +51,8 @@
 // CHECK-TYPE1: %[[B1:.*]] = index_cast %[[P1]] : i64 to index
 // CHECK-TYPE1: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
 // CHECK-TYPE1:   %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi32>
-// CHECK-TYPE1:   %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index
+// CHECK-TYPE1:   %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64
+// CHECK-TYPE1:   %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
 // CHECK-TYPE1:   %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
 // CHECK-TYPE1:   %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
 // CHECK-TYPE1:   %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
@@ -62,9 +63,11 @@
 // CHECK-TYPE2: %[[C0:.*]] = constant 0 : index
 // CHECK-TYPE2: %[[C1:.*]] = constant 1 : index
 // CHECK-TYPE2: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi32>
-// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[P0]] : i32 to index
+// CHECK-TYPE2: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64
+// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
 // CHECK-TYPE2: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi32>
-// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[P1]] : i32 to index
+// CHECK-TYPE2: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64
+// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
 // CHECK-TYPE2: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
 // CHECK-TYPE2:   %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi64>
 // CHECK-TYPE2:   %[[INDC:.*]] = index_cast %[[IND0]] : i64 to index
@@ -78,12 +81,15 @@
 // CHECK-TYPE3: %[[C0:.*]] = constant 0 : index
 // CHECK-TYPE3: %[[C1:.*]] = constant 1 : index
 // CHECK-TYPE3: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi32>
-// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[P0]] : i32 to index
+// CHECK-TYPE3: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64
+// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
 // CHECK-TYPE3: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi32>
-// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[P1]] : i32 to index
+// CHECK-TYPE3: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64
+// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
 // CHECK-TYPE3: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
 // CHECK-TYPE3:   %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi32>
-// CHECK-TYPE3:   %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index
+// CHECK-TYPE3:   %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64
+// CHECK-TYPE3:   %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
 // CHECK-TYPE3:   %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
 // CHECK-TYPE3:   %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
 // CHECK-TYPE3:   %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
@@ -94,12 +100,15 @@
 // CHECK-TYPE4: %[[C0:.*]] = constant 0 : index
 // CHECK-TYPE4: %[[C1:.*]] = constant 1 : index
 // CHECK-TYPE4: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi16>
-// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[P0]] : i16 to index
+// CHECK-TYPE4: %[[Z0:.*]] = zexti %[[P0]] : i16 to i64
+// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
 // CHECK-TYPE4: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi16>
-// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[P1]] : i16 to index
+// CHECK-TYPE4: %[[Z1:.*]] = zexti %[[P1]] : i16 to i64
+// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
 // CHECK-TYPE4: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
 // CHECK-TYPE4:   %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi16>
-// CHECK-TYPE4:   %[[INDC:.*]] = index_cast %[[IND0]] : i16 to index
+// CHECK-TYPE4:   %[[ZEXT:.*]] = zexti %[[IND0]] : i16 to i64
+// CHECK-TYPE4:   %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
 // CHECK-TYPE4:   %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
 // CHECK-TYPE4:   %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
 // CHECK-TYPE4:   %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
@@ -110,12 +119,15 @@
 // CHECK-TYPE5: %[[C0:.*]] = constant 0 : index
 // CHECK-TYPE5: %[[C1:.*]] = constant 1 : index
 // CHECK-TYPE5: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi8>
-// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[P0]] : i8 to index
+// CHECK-TYPE5: %[[Z0:.*]] = zexti %[[P0]] : i8 to i64
+// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
 // CHECK-TYPE5: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi8>
-// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[P1]] : i8 to index
+// CHECK-TYPE5: %[[Z1:.*]] = zexti %[[P1]] : i8 to i64
+// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
 // CHECK-TYPE5: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
 // CHECK-TYPE5:   %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi8>
-// CHECK-TYPE5:   %[[INDC:.*]] = index_cast %[[IND0]] : i8 to index
+// CHECK-TYPE5:   %[[ZEXT:.*]] = zexti %[[IND0]] : i8 to i64
+// CHECK-TYPE5:   %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
 // CHECK-TYPE5:   %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
 // CHECK-TYPE5:   %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
 // CHECK-TYPE5:   %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64

diff  --git a/mlir/test/Dialect/Linalg/sparse_vector.mlir b/mlir/test/Dialect/Linalg/sparse_vector.mlir
index 882ef077cb013..87f1a406179d7 100644
--- a/mlir/test/Dialect/Linalg/sparse_vector.mlir
+++ b/mlir/test/Dialect/Linalg/sparse_vector.mlir
@@ -85,12 +85,15 @@ func @scale_d(%arga: tensor<1024xf32>, %scale: f32, %argx: tensor<1024xf32>) ->
 // CHECK-VEC0-DAG:   %[[c0:.*]] = constant 0 : index
 // CHECK-VEC0-DAG:   %[[c1:.*]] = constant 1 : index
 // CHECK-VEC0:       %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC0:       %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC0:       %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC0:       %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC0:       %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC0:       %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC0:       %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC0:       %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC0:       scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
 // CHECK-VEC0:         %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0:         %[[ci:.*]] = index_cast %[[li]] : i32 to index
+// CHECK-VEC0:         %[[zi:.*]] = zexti %[[li]] : i32 to i64
+// CHECK-VEC0:         %[[ci:.*]] = index_cast %[[zi]] : i64 to index
 // CHECK-VEC0:         %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
 // CHECK-VEC0:         %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
 // CHECK-VEC0:         %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
@@ -102,12 +105,15 @@ func @scale_d(%arga: tensor<1024xf32>, %scale: f32, %argx: tensor<1024xf32>) ->
 // CHECK-VEC1-DAG:   %[[c0:.*]] = constant 0 : index
 // CHECK-VEC1-DAG:   %[[c1:.*]] = constant 1 : index
 // CHECK-VEC1:       %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC1:       %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC1:       %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC1:       %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC1:       %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC1:       %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC1:       %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC1:       %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC1:       scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
 // CHECK-VEC1:         %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1:         %[[ci:.*]] = index_cast %[[li]] : i32 to index
+// CHECK-VEC1:         %[[zi:.*]] = zexti %[[li]] : i32 to i64
+// CHECK-VEC1:         %[[ci:.*]] = index_cast %[[zi]] : i64 to index
 // CHECK-VEC1:         %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
 // CHECK-VEC1:         %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
 // CHECK-VEC1:         %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
@@ -120,17 +126,20 @@ func @scale_d(%arga: tensor<1024xf32>, %scale: f32, %argx: tensor<1024xf32>) ->
 // CHECK-VEC2-DAG:   %[[c1:.*]] = constant 1 : index
 // CHECK-VEC2-DAG:   %[[c16:.*]] = constant 16 : index
 // CHECK-VEC2:       %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC2:       %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2:       %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2:       %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC2:       %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC2:       %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2:       %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2:       %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC2:       scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
 // CHECK-VEC2:         %[[sub:.*]] = subi %[[s]], %[[i]] : index
 // CHECK-VEC2:         %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
 // CHECK-VEC2:         %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2:         %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64>
 // CHECK-VEC2:         %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2:         %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2:         %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK-VEC2:         %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2:         vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2:         vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
 // CHECK-VEC2:       }
 // CHECK-VEC2:       return
 //
@@ -151,17 +160,20 @@ func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tensor<1024
 // CHECK-VEC2-DAG:   %[[c1:.*]] = constant 1 : index
 // CHECK-VEC2-DAG:   %[[c16:.*]] = constant 16 : index
 // CHECK-VEC2:       %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC2:       %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2:       %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2:       %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC2:       %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC2:       %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2:       %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2:       %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC2:       scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
 // CHECK-VEC2:         %[[sub:.*]] = subi %[[s]], %[[i]] : index
 // CHECK-VEC2:         %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
 // CHECK-VEC2:         %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2:         %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64>
 // CHECK-VEC2:         %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2:         %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2:         %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK-VEC2:         %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2:         vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2:         vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
 // CHECK-VEC2:       }
 // CHECK-VEC2:       return
 //
@@ -303,13 +315,16 @@ func @reduction_17(%arga: tensor<17xf32>, %argb: tensor<17xf32>, %argx: tensor<f
 // CHECK-VEC0-DAG:   %[[c512:.*]] = constant 512 : index
 // CHECK-VEC0:       scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
 // CHECK-VEC0:         %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0:         %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC0:         %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC0:         %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC0:         %[[a:.*]] = addi %[[i]], %[[c1]] : index
 // CHECK-VEC0:         %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC0:         %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC0:         %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC0:         %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC0:         scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
 // CHECK-VEC0:           %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC0:           %[[cj:.*]] = index_cast %[[lj]] : i32 to index
+// CHECK-VEC0:           %[[zj:.*]] = zexti %[[lj]] : i32 to i64
+// CHECK-VEC0:           %[[cj:.*]] = index_cast %[[zj]] : i64 to index
 // CHECK-VEC0:           %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
 // CHECK-VEC0:           %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
 // CHECK-VEC0:           %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
@@ -324,13 +339,16 @@ func @reduction_17(%arga: tensor<17xf32>, %argb: tensor<17xf32>, %argx: tensor<f
 // CHECK-VEC1-DAG:   %[[c512:.*]] = constant 512 : index
 // CHECK-VEC1:       scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
 // CHECK-VEC1:         %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1:         %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC1:         %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC1:         %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC1:         %[[a:.*]] = addi %[[i]], %[[c1]] : index
 // CHECK-VEC1:         %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC1:         %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC1:         %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC1:         %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC1:         scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
 // CHECK-VEC1:           %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC1:           %[[cj:.*]] = index_cast %[[lj]] : i32 to index
+// CHECK-VEC1:           %[[zj:.*]] = zexti %[[lj]] : i32 to i64
+// CHECK-VEC1:           %[[cj:.*]] = index_cast %[[zj]] : i64 to index
 // CHECK-VEC1:           %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
 // CHECK-VEC1:           %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
 // CHECK-VEC1:           %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
@@ -346,18 +364,21 @@ func @reduction_17(%arga: tensor<17xf32>, %argb: tensor<17xf32>, %argx: tensor<f
 // CHECK-VEC2-DAG:   %[[c512:.*]] = constant 512 : index
 // CHECK-VEC2:       scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
 // CHECK-VEC2:         %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC2:         %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2:         %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2:         %[[q:.*]] = index_cast %[[a]] : i64 to index
 // CHECK-VEC2:         %[[a:.*]] = addi %[[i]], %[[c1]] : index
 // CHECK-VEC2:         %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC2:         %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2:         %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2:         %[[s:.*]] = index_cast %[[b]] : i64 to index
 // CHECK-VEC2:         scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
 // CHECK-VEC2:           %[[sub:.*]] = subi %[[s]], %[[j]] : index
 // CHECK-VEC2:           %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
 // CHECK-VEC2:           %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2:           %[[zj:.*]] = zexti %[[lj]] : vector<16xi32> to vector<16xi64>
 // CHECK-VEC2:           %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2:           %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2:           %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK-VEC2:           %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2:           vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2:           vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
 // CHECK-VEC2:         }
 // CHECK-VEC2:       }
 // CHECK-VEC2:       return

diff  --git a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir
index a1070db5d577d..68d0ee620f9f6 100644
--- a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir
+++ b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir
@@ -66,7 +66,6 @@ module {
   func private @getTensorFilename(index) -> (!Filename)
   func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
   func private @delSparseTensor(!SparseTensor) -> ()
-  func private @print_memref_f32(%ptr : tensor<*xf32>)
 
   //
   // Main driver that reads matrix from file and calls the sparse kernel.
@@ -86,8 +85,8 @@ module {
     %sparse = constant true
     memref.store %sparse, %annotations[%c0] : memref<?xi1>
     memref.store %sparse, %annotations[%c1] : memref<?xi1>
-    %i32 = constant 3 : index
-    %f32 = constant 1 : index
+    %i32 = constant 2 : index
+    %f32 = constant 2 : index
 
     // Setup memory for the dense matrices and initialize.
     %adata = memref.alloc(%c5, %c10) : memref<?x?xf32>

diff  --git a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir
index 10b0981dc58f5..6e067b336d9d5 100644
--- a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir
+++ b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir
@@ -58,7 +58,6 @@ module {
   func private @getTensorFilename(index) -> (!Filename)
   func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
   func private @delSparseTensor(!SparseTensor) -> ()
-  func private @print_memref_f64(%ptr : tensor<*xf64>)
 
   //
   // Main driver that reads matrix from file and calls the sparse kernel.
@@ -76,8 +75,8 @@ module {
     %sparse = constant true
     memref.store %sparse, %annotations[%c0] : memref<?xi1>
     memref.store %sparse, %annotations[%c1] : memref<?xi1>
-    %i64 = constant 2 : index
-    %f64 = constant 0 : index
+    %i64 = constant 1 : index
+    %f64 = constant 1 : index
 
     // Setup memory for a single reduction scalar,
     // initialized to zero.

diff  --git a/mlir/test/Integration/Sparse/sparse_matvec.mlir b/mlir/test/Integration/Sparse/sparse_matvec.mlir
new file mode 100644
index 0000000000000..41ee9ccc63b8a
--- /dev/null
+++ b/mlir/test/Integration/Sparse/sparse_matvec.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-opt %s \
+// RUN:   --test-sparsification="lower ptr-type=4 ind-type=4" \
+// RUN:   --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize  \
+// RUN:   --convert-vector-to-llvm --convert-std-to-llvm | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// RUN: mlir-opt %s \
+// RUN:   --test-sparsification="lower vectorization-strategy=2 ptr-type=4 ind-type=4 vl=16" \
+// RUN:   --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize  \
+// RUN:   --convert-vector-to-llvm --convert-std-to-llvm | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+//
+// Use descriptive names for opaque pointers.
+//
+!Filename     = type !llvm.ptr<i8>
+!SparseTensor = type !llvm.ptr<i8>
+
+#matvec = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>, // A
+    affine_map<(i,j) -> (j)>,   // b
+    affine_map<(i,j) -> (i)>    // x (out)
+  ],
+  sparse = [
+    [ "D", "S" ], // A
+    [ "D"      ], // b
+    [ "D"      ]  // x
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "X(i) += A(i,j) * B(j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes a matching sparse storage scheme
+// from file, and runs the resulting code with the JIT compiler.
+//
+module {
+  //
+  // The kernel expressed as an annotated Linalg op. The kernel multiplies
+  // a sparse matrix A with a dense vector b into a dense vector x.
+  //
+  func @kernel_matvec(%argA: !SparseTensor,
+                      %argb: tensor<?xf32>,
+                      %argx: tensor<?xf32>) -> tensor<?xf32> {
+    %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<?x?xf32>
+    %0 = linalg.generic #matvec
+      ins(%arga, %argb: tensor<?x?xf32>, tensor<?xf32>)
+      outs(%argx: tensor<?xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = mulf %a, %b : f32
+        %1 = addf %x, %0 : f32
+        linalg.yield %1 : f32
+    } -> tensor<?xf32>
+    return %0 : tensor<?xf32>
+  }
+
+  //
+  // Runtime support library that is called directly from here.
+  //
+  func private @getTensorFilename(index) -> (!Filename)
+  func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
+  func private @delSparseTensor(!SparseTensor) -> ()
+
+  //
+  // Main driver that reads matrix from file and calls the sparse kernel.
+  //
+  func @entry() {
+    %f0 = constant 0.0 : f32
+    %c0 = constant 0 : index
+    %c1 = constant 1 : index
+    %c2 = constant 2 : index
+    %c4 = constant 4 : index
+    %c256 = constant 256 : index
+
+    // Mark inner dimension of the matrix as sparse and encode the
+    // storage scheme types (this must match the metadata in the
+    // alias above and compiler switches). In this case, we test
+    // that 8-bit indices and pointers work correctly.
+    %annotations = memref.alloc(%c2) : memref<?xi1>
+    %sparse = constant true
+    %dense = constant false
+    memref.store %dense, %annotations[%c0] : memref<?xi1>
+    memref.store %sparse, %annotations[%c1] : memref<?xi1>
+    %u8 = constant 4 : index
+    %f32 = constant 2 : index
+
+    // Read the sparse matrix from file, construct sparse storage.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %f32)
+      : (!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
+
+    // Initialize dense vectors.
+    %bdata = memref.alloc(%c256) : memref<?xf32>
+    %xdata = memref.alloc(%c4) : memref<?xf32>
+    scf.for %i = %c0 to %c256 step %c1 {
+      %k = addi %i, %c1 : index
+      %l = index_cast %k : index to i32
+      %f = sitofp %l : i32 to f32
+      memref.store %f, %bdata[%i] : memref<?xf32>
+    }
+    scf.for %i = %c0 to %c4 step %c1 {
+      memref.store %f0, %xdata[%i] : memref<?xf32>
+    }
+    %b = memref.tensor_load %bdata : memref<?xf32>
+    %x = memref.tensor_load %xdata : memref<?xf32>
+
+    // Call kernel.
+    %0 = call @kernel_matvec(%a, %b, %x)
+      : (!SparseTensor, tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
+
+    // Print the result for verification.
+    //
+    // CHECK: ( 1659, 1534, 21, 18315 )
+    //
+    %m = memref.buffer_cast %0 : memref<?xf32>
+    %v = vector.transfer_read %m[%c0], %f0: memref<?xf32>, vector<4xf32>
+    vector.print %v : vector<4xf32>
+
+    // Release the resources.
+    call @delSparseTensor(%a) : (!SparseTensor) -> ()
+    memref.dealloc %bdata : memref<?xf32>
+    memref.dealloc %xdata : memref<?xf32>
+
+    return
+  }
+}

diff  --git a/mlir/test/Integration/data/wide.mtx b/mlir/test/Integration/data/wide.mtx
new file mode 100644
index 0000000000000..6b5ee208afe12
--- /dev/null
+++ b/mlir/test/Integration/data/wide.mtx
@@ -0,0 +1,23 @@
+%%MatrixMarket matrix coordinate real general
+%
+% This is a test sparse matrix in Matrix Market Exchange Format.
+% see https://math.nist.gov/MatrixMarket
+%
+4 256 17
+1 1     1.0
+1 127   2.0
+1 128   3.0
+1 255   4.0
+2 2     5.0
+2 254   6.0
+3 3     7.0
+4 1     8.0
+4 2     9.0
+4 4    10.0
+4 99   11.0
+4 127  12.0
+4 128  13.0
+4 129  14.0
+4 250  15.0
+4 254  16.0
+4 256  17.0
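
As a sanity check, the expected result in the CHECK line of sparse_matvec.mlir
follows directly from this matrix and the test's dense vector b(j) = j. A
small reference computation in C++ (illustration only, not part of the test
suite):

  #include <cstdio>

  int main() {
    // The 17 nonzeros of the 4x256 matrix in wide.mtx as (row, col, value),
    // with 1-based coordinates as in the file.
    struct Entry { int i, j; double v; };
    Entry nz[] = {{1, 1, 1.0},    {1, 127, 2.0},  {1, 128, 3.0},
                  {1, 255, 4.0},  {2, 2, 5.0},    {2, 254, 6.0},
                  {3, 3, 7.0},    {4, 1, 8.0},    {4, 2, 9.0},
                  {4, 4, 10.0},   {4, 99, 11.0},  {4, 127, 12.0},
                  {4, 128, 13.0}, {4, 129, 14.0}, {4, 250, 15.0},
                  {4, 254, 16.0}, {4, 256, 17.0}};
    double x[4] = {0, 0, 0, 0};
    for (const Entry &e : nz)
      x[e.i - 1] += e.v * e.j; // b(j) = j, as initialized by the test driver
    std::printf("( %g, %g, %g, %g )\n", x[0], x[1], x[2], x[3]);
    // Prints: ( 1659, 1534, 21, 18315 ), matching the CHECK line above.
    return 0;
  }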
