[Mlir-commits] [mlir] e2d3db4 - [mlir][sparse] add casts to operations to lattice and exp builders
Author: Aart Bik
Date: 2021-09-09T08:49:50-07:00
New Revision: e2d3db42e556e68a7261a2a2c0b5eac4a41e60b3
URL: https://github.com/llvm/llvm-project/commit/e2d3db42e556e68a7261a2a2c0b5eac4a41e60b3
DIFF: https://github.com/llvm/llvm-project/commit/e2d3db42e556e68a7261a2a2c0b5eac4a41e60b3.diff
LOG: [mlir][sparse] add casts to operations to lattice and exp builders
Further enhance the set of operations that can be handled by the sparse compiler.
Reviewed By: bixia
Differential Revision: https://reviews.llvm.org/D109413
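In practice, this means a linalg.generic body such as the following (taken from the
new integration test added below) is now accepted by the sparse compiler: the sitofp
is mapped onto a kCastSF tensor expression and propagated through the lattices as a
zero-preserving unary operation, with its result value recorded so the destination
type can be recovered during code generation.

    ^bb(%a: i32, %x : f32):
      %cst = sitofp %a : i32 to f32
      linalg.yield %cst : f32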
Added:
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
Modified:
mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 9f6657dbf6fbf..d396f7a50ef50 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -34,6 +34,16 @@ enum Kind {
kFloorF,
kNegF,
kNegI,
+ kTruncF,
+ kExtF,
+ kCastFS, // signed
+ kCastFU, // unsigned
+ kCastSF, // signed
+ kCastUF, // unsigned
+ kCastS, // signed
+ kCastU, // unsigned
+ kTruncI,
+ kBitCast,
// Binary operations.
kMulF,
kMulI,
@@ -73,8 +83,9 @@ struct TensorExp {
Children children;
};
- /// Direct link to IR for an invariant. During code generation,
- /// field is used to cache "hoisted" loop invariant tensor loads.
+ /// Direct link to IR for an invariant or the destination value (to
+ /// infer destination type) of a cast operation. During code generation,
+ /// this field may be used to cache "hoisted" loop invariant tensor loads.
Value val;
};
@@ -115,6 +126,7 @@ class Merger {
/// Adds a tensor expression. Returns its index.
unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value());
+ unsigned addExp(Kind k, unsigned e, Value v) { return addExp(k, e, -1u, v); }
unsigned addExp(Kind k, Value v) { return addExp(k, -1u, -1u, v); }
/// Adds an iteration lattice point. Returns its index.
@@ -140,7 +152,7 @@ class Merger {
/// Maps the unary operator over the lattice set of the operand, i.e. each
/// lattice point on an expression E is simply copied over, but with OP E
/// as new expression. Returns the index of the new set.
- unsigned mapSet(Kind kind, unsigned s0);
+ unsigned mapSet(Kind kind, unsigned s0, Value v = Value());
/// Optimizes the iteration lattice points in the given set. This
/// method should be called right before code generation to avoid
@@ -220,6 +232,7 @@ class Merger {
private:
bool maybeZero(unsigned e) const;
bool isInvariant(unsigned e) const;
+ Type inferType(unsigned e, Value src);
/// Traverses the SSA tree (possibly a DAG) to build a tensor expression.
Optional<unsigned> buildTensorExp(linalg::GenericOp op, Value v);
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 43f11b8603615..99645a1d7d69d 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -14,9 +14,9 @@
namespace mlir {
namespace sparse_tensor {
-//
+//===----------------------------------------------------------------------===//
// Constructors.
-//
+//===----------------------------------------------------------------------===//
TensorExp::TensorExp(Kind k, unsigned x, unsigned y, Value v)
: kind(k), val(v) {
@@ -37,6 +37,20 @@ TensorExp::TensorExp(Kind k, unsigned x, unsigned y, Value v)
children.e0 = x;
children.e1 = y;
break;
+ case kTruncF:
+ case kExtF:
+ case kCastFS:
+ case kCastFU:
+ case kCastSF:
+ case kCastUF:
+ case kCastS:
+ case kCastU:
+ case kTruncI:
+ case kBitCast:
+ assert(x != -1u && y == -1u && v);
+ children.e0 = x;
+ children.e1 = y;
+ break;
default:
assert(x != -1u && y != -1u && !v);
children.e0 = x;
@@ -53,9 +67,9 @@ LatPoint::LatPoint(unsigned n, unsigned e, unsigned b)
LatPoint::LatPoint(const llvm::BitVector &b, unsigned e)
: bits(b), simple(), exp(e) {}
-//
+//===----------------------------------------------------------------------===//
// Lattice methods.
-//
+//===----------------------------------------------------------------------===//
unsigned Merger::addExp(Kind k, unsigned e0, unsigned e1, Value v) {
unsigned e = tensorExps.size();
@@ -109,11 +123,11 @@ unsigned Merger::takeDisj(Kind kind, unsigned s0, unsigned s1) {
return s;
}
-unsigned Merger::mapSet(Kind kind, unsigned s0) {
- assert(kAbsF <= kind && kind <= kNegI);
+unsigned Merger::mapSet(Kind kind, unsigned s0, Value v) {
+ assert(kAbsF <= kind && kind <= kBitCast);
unsigned s = addSet();
for (unsigned p : latSets[s0]) {
- unsigned e = addExp(kind, latPoints[p].exp);
+ unsigned e = addExp(kind, latPoints[p].exp, v);
latPoints.push_back(LatPoint(latPoints[p].bits, e));
latSets[s].push_back(latPoints.size() - 1);
}
@@ -207,6 +221,16 @@ bool Merger::isConjunction(unsigned t, unsigned e) const {
case kFloorF:
case kNegF:
case kNegI:
+ case kTruncF:
+ case kExtF:
+ case kCastFS:
+ case kCastFU:
+ case kCastSF:
+ case kCastUF:
+ case kCastS:
+ case kCastU:
+ case kTruncI:
+ case kBitCast:
return isConjunction(t, tensorExps[e].children.e0);
case kDivF: // note: x / c only
case kDivS:
@@ -230,9 +254,9 @@ bool Merger::isConjunction(unsigned t, unsigned e) const {
#ifndef NDEBUG
-//
+//===----------------------------------------------------------------------===//
// Print methods (for debugging).
-//
+//===----------------------------------------------------------------------===//
static const char *kindToOpSymbol(Kind kind) {
switch (kind) {
@@ -250,6 +274,17 @@ static const char *kindToOpSymbol(Kind kind) {
return "-";
case kNegI:
return "-";
+ case kTruncF:
+ case kExtF:
+ case kCastFS:
+ case kCastFU:
+ case kCastSF:
+ case kCastUF:
+ case kCastS:
+ case kCastU:
+ case kTruncI:
+ case kBitCast:
+ return "cast";
case kMulF:
return "*";
case kMulI:
@@ -301,6 +336,16 @@ void Merger::dumpExp(unsigned e) const {
case kFloorF:
case kNegF:
case kNegI:
+ case kTruncF:
+ case kExtF:
+ case kCastFS:
+ case kCastFU:
+ case kCastSF:
+ case kCastUF:
+ case kCastS:
+ case kCastU:
+ case kTruncI:
+ case kBitCast:
llvm::dbgs() << kindToOpSymbol(tensorExps[e].kind) << " ";
dumpExp(tensorExps[e].children.e0);
break;
@@ -358,9 +403,9 @@ void Merger::dumpBits(const llvm::BitVector &bits) const {
#endif // NDEBUG
-//
+//===----------------------------------------------------------------------===//
// Builder methods.
-//
+//===----------------------------------------------------------------------===//
unsigned Merger::buildLattices(unsigned e, unsigned i) {
Kind kind = tensorExps[e].kind;
@@ -380,13 +425,24 @@ unsigned Merger::buildLattices(unsigned e, unsigned i) {
case kFloorF:
case kNegF:
case kNegI:
+ case kTruncF:
+ case kExtF:
+ case kCastFS:
+ case kCastFU:
+ case kCastSF:
+ case kCastUF:
+ case kCastS:
+ case kCastU:
+ case kTruncI:
+ case kBitCast:
// A zero preserving operation (viz. f(0) = 0, [Bik96,Ch5]) maps the
// lattice set of the operand through the operator into a new set.
//
// -y|!y | y |
// --+---+---+
// | 0 |-y |
- return mapSet(kind, buildLattices(tensorExps[e].children.e0, i));
+ return mapSet(kind, buildLattices(tensorExps[e].children.e0, i),
+ tensorExps[e].val);
case kMulF:
case kMulI:
case kAndI:
@@ -469,6 +525,16 @@ bool Merger::isInvariant(unsigned e) const {
return tensorExps[e].kind == kInvariant;
}
+Type Merger::inferType(unsigned e, Value src) {
+ // Obtain the destination type from the cast node.
+ Type dtp = tensorExps[e].val.getType();
+ // Inspect source type. For vector types, apply the same
+ // vectorization to the destination type.
+ if (auto vtp = src.getType().dyn_cast<VectorType>())
+ return VectorType::get(vtp.getNumElements(), dtp);
+ return dtp;
+}
+
Optional<unsigned> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
if (auto arg = v.dyn_cast<BlockArgument>()) {
unsigned argN = arg.getArgNumber();
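To illustrate the type inference above (a schematic example, not part of the commit):
the destination type is read from the cast's result value stored in the expression
node, and when the kernel is vectorized (e.g. the vl=2 SIMD run in the new test) the
scalar destination type is rewrapped with the source's vector length. So the scalar
cast

    %c = sitofp %a : i32 to f32

is emitted in vector form as

    %c = sitofp %va : vector<2xi32> to vector<2xf32>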
@@ -501,12 +567,32 @@ Optional<unsigned> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
if (isa<FloorFOp>(def))
return addExp(kFloorF, e);
if (isa<NegFOp>(def))
- return addExp(kNegF, e);
- // TODO: no negi in std?
+ return addExp(kNegF, e); // TODO: no negi in std?
+ if (isa<FPTruncOp>(def))
+ return addExp(kTruncF, e, v);
+ if (isa<FPExtOp>(def))
+ return addExp(kExtF, e, v);
+ if (isa<FPToSIOp>(def))
+ return addExp(kCastFS, e, v);
+ if (isa<FPToUIOp>(def))
+ return addExp(kCastFU, e, v);
+ if (isa<SIToFPOp>(def))
+ return addExp(kCastSF, e, v);
+ if (isa<UIToFPOp>(def))
+ return addExp(kCastUF, e, v);
+ if (isa<SignExtendIOp>(def))
+ return addExp(kCastS, e, v);
+ if (isa<ZeroExtendIOp>(def))
+ return addExp(kCastU, e, v);
+ if (isa<TruncateIOp>(def))
+ return addExp(kTruncI, e, v);
+ if (isa<BitcastOp>(def))
+ return addExp(kBitCast, e, v);
}
}
// Construct binary operations if subexpressions can be built.
- // TODO: see buildLattices() for an explanation of rejecting certain divisions
+ // TODO: see buildLattices() for an explanation of rejecting
+ // certain division and shift operations
if (def->getNumOperands() == 2) {
auto x = buildTensorExp(op, def->getOperand(0));
auto y = buildTensorExp(op, def->getOperand(1));
@@ -555,6 +641,7 @@ Value Merger::buildExp(PatternRewriter &rewriter, Location loc, unsigned e,
case kTensor:
case kInvariant:
llvm_unreachable("unexpected non-op");
+ // Unary ops.
case kAbsF:
return rewriter.create<AbsFOp>(loc, v0);
case kCeilF:
@@ -566,6 +653,27 @@ Value Merger::buildExp(PatternRewriter &rewriter, Location loc, unsigned e,
case kNegI:
assert(v1); // no negi in std
return rewriter.create<SubIOp>(loc, v0, v1);
+ case kTruncF:
+ return rewriter.create<FPTruncOp>(loc, v0, inferType(e, v0));
+ case kExtF:
+ return rewriter.create<FPExtOp>(loc, v0, inferType(e, v0));
+ case kCastFS:
+ return rewriter.create<FPToSIOp>(loc, v0, inferType(e, v0));
+ case kCastFU:
+ return rewriter.create<FPToUIOp>(loc, v0, inferType(e, v0));
+ case kCastSF:
+ return rewriter.create<SIToFPOp>(loc, v0, inferType(e, v0));
+ case kCastUF:
+ return rewriter.create<UIToFPOp>(loc, v0, inferType(e, v0));
+ case kCastS:
+ return rewriter.create<SignExtendIOp>(loc, v0, inferType(e, v0));
+ case kCastU:
+ return rewriter.create<ZeroExtendIOp>(loc, v0, inferType(e, v0));
+ case kTruncI:
+ return rewriter.create<TruncateIOp>(loc, v0, inferType(e, v0));
+ case kBitCast:
+ return rewriter.create<BitcastOp>(loc, v0, inferType(e, v0));
+ // Binary ops.
case kMulF:
return rewriter.create<MulFOp>(loc, v0, v1);
case kMulI:
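Putting the pieces together, the sparse code generated for a cast kernel is,
schematically, a loop over the stored entries of the compressed vector with the cast
applied to each stored value (a rough sketch only; buffer names are invented and the
actual sparsifier output contains additional bufferization and bookkeeping):

    scf.for %i = %lo to %hi step %c1 {
      %idx = memref.load %indices[%i] : memref<?xindex>
      %a   = memref.load %values[%i]  : memref<?xi32>
      %c   = sitofp %a : i32 to f32
      memref.store %c, %out[%idx] : memref<10xf32>
    }

Because the cast is zero preserving, the implicit zeros never need to be visited;
the dense output simply keeps its zero initialization outside the stored positions.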
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
new file mode 100644
index 0000000000000..8024033d47285
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
@@ -0,0 +1,277 @@
+// RUN: mlir-opt %s \
+// RUN: --sparsification --sparse-tensor-conversion \
+// RUN: --convert-vector-to-scf --convert-scf-to-std \
+// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN: --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN: --convert-vector-to-llvm --convert-memref-to-llvm --convert-std-to-llvm | \
+// RUN: mlir-cpu-runner \
+// RUN: -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// Do the same run, but now with SIMDization as well. This should not change the outcome.
+//
+// RUN: mlir-opt %s \
+// RUN: --sparsification="vectorization-strategy=2 vl=2 enable-simd-index32" --sparse-tensor-conversion \
+// RUN: --convert-vector-to-scf --convert-scf-to-std \
+// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN: --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN: --convert-vector-to-llvm --convert-memref-to-llvm --convert-std-to-llvm | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/test.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN: -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+
+#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
+
+#trait_cast = {
+ indexing_maps = [
+ affine_map<(i) -> (i)>, // A (in)
+ affine_map<(i) -> (i)> // X (out)
+ ],
+ iterator_types = ["parallel"],
+ doc = "X(i) = cast A(i)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to actual sparse
+// code, initializes a matching sparse storage scheme from a dense vector,
+// and runs the resulting code with the JIT compiler.
+//
+module {
+ //
+ // Various kernels that cast a sparse vector from one type to another.
+ // Standard supports the following casts.
+ // sitofp
+ // uitofp
+ // fptosi
+ // fptoui
+ // fpext
+ // fptrunc
+ // sexti
+ // zexti
+ // trunci
+ // bitcast
+ // Since all casts are "zero preserving" unary operations, lattice computation
+ // and conversion to sparse code are straightforward.
+ //
+ func @sparse_cast_s32_to_f32(%arga: tensor<10xi32, #SV>) -> tensor<10xf32> {
+ %argx = constant dense<0.0> : tensor<10xf32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xi32, #SV>)
+ outs(%argx: tensor<10xf32>) {
+ ^bb(%a: i32, %x : f32):
+ %cst = sitofp %a : i32 to f32
+ linalg.yield %cst : f32
+ } -> tensor<10xf32>
+ return %0 : tensor<10xf32>
+ }
+ func @sparse_cast_u32_to_f32(%arga: tensor<10xi32, #SV>) -> tensor<10xf32> {
+ %argx = constant dense<0.0> : tensor<10xf32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xi32, #SV>)
+ outs(%argx: tensor<10xf32>) {
+ ^bb(%a: i32, %x : f32):
+ %cst = uitofp %a : i32 to f32
+ linalg.yield %cst : f32
+ } -> tensor<10xf32>
+ return %0 : tensor<10xf32>
+ }
+ func @sparse_cast_f32_to_s32(%arga: tensor<10xf32, #SV>) -> tensor<10xi32> {
+ %argx = constant dense<0> : tensor<10xi32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xf32, #SV>)
+ outs(%argx: tensor<10xi32>) {
+ ^bb(%a: f32, %x : i32):
+ %cst = fptosi %a : f32 to i32
+ linalg.yield %cst : i32
+ } -> tensor<10xi32>
+ return %0 : tensor<10xi32>
+ }
+ func @sparse_cast_f64_to_u32(%arga: tensor<10xf64, #SV>) -> tensor<10xi32> {
+ %argx = constant dense<0> : tensor<10xi32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xf64, #SV>)
+ outs(%argx: tensor<10xi32>) {
+ ^bb(%a: f64, %x : i32):
+ %cst = fptoui %a : f64 to i32
+ linalg.yield %cst : i32
+ } -> tensor<10xi32>
+ return %0 : tensor<10xi32>
+ }
+ func @sparse_cast_f32_to_f64(%arga: tensor<10xf32, #SV>) -> tensor<10xf64> {
+ %argx = constant dense<0.0> : tensor<10xf64>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xf32, #SV>)
+ outs(%argx: tensor<10xf64>) {
+ ^bb(%a: f32, %x : f64):
+ %cst = fpext %a : f32 to f64
+ linalg.yield %cst : f64
+ } -> tensor<10xf64>
+ return %0 : tensor<10xf64>
+ }
+ func @sparse_cast_f64_to_f32(%arga: tensor<10xf64, #SV>) -> tensor<10xf32> {
+ %argx = constant dense<0.0> : tensor<10xf32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xf64, #SV>)
+ outs(%argx: tensor<10xf32>) {
+ ^bb(%a: f64, %x : f32):
+ %cst = fptrunc %a : f64 to f32
+ linalg.yield %cst : f32
+ } -> tensor<10xf32>
+ return %0 : tensor<10xf32>
+ }
+ func @sparse_cast_s32_to_u64(%arga: tensor<10xi32, #SV>) -> tensor<10xi64> {
+ %argx = constant dense<0> : tensor<10xi64>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xi32, #SV>)
+ outs(%argx: tensor<10xi64>) {
+ ^bb(%a: i32, %x : i64):
+ %cst = sexti %a : i32 to i64
+ linalg.yield %cst : i64
+ } -> tensor<10xi64>
+ return %0 : tensor<10xi64>
+ }
+ func @sparse_cast_u32_to_s64(%arga: tensor<10xi32, #SV>) -> tensor<10xi64> {
+ %argx = constant dense<0> : tensor<10xi64>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xi32, #SV>)
+ outs(%argx: tensor<10xi64>) {
+ ^bb(%a: i32, %x : i64):
+ %cst = zexti %a : i32 to i64
+ linalg.yield %cst : i64
+ } -> tensor<10xi64>
+ return %0 : tensor<10xi64>
+ }
+ func @sparse_cast_i32_to_i8(%arga: tensor<10xi32, #SV>) -> tensor<10xi8> {
+ %argx = constant dense<0> : tensor<10xi8>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xi32, #SV>)
+ outs(%argx: tensor<10xi8>) {
+ ^bb(%a: i32, %x : i8):
+ %cst = trunci %a : i32 to i8
+ linalg.yield %cst : i8
+ } -> tensor<10xi8>
+ return %0 : tensor<10xi8>
+ }
+ func @sparse_cast_f32_as_s32(%arga: tensor<10xf32, #SV>) -> tensor<10xi32> {
+ %argx = constant dense<0> : tensor<10xi32>
+ %0 = linalg.generic #trait_cast
+ ins(%arga: tensor<10xf32, #SV>)
+ outs(%argx: tensor<10xi32>) {
+ ^bb(%a: f32, %x : i32):
+ %cst = bitcast %a : f32 to i32
+ linalg.yield %cst : i32
+ } -> tensor<10xi32>
+ return %0 : tensor<10xi32>
+ }
+
+ //
+ // Main driver that converts a dense tensor into a sparse tensor
+ // and then calls the sparse casting kernel.
+ //
+ func @entry() {
+ %z = constant 0 : index
+ %b = constant 0 : i8
+ %i = constant 0 : i32
+ %l = constant 0 : i64
+ %f = constant 0.0 : f32
+ %d = constant 0.0 : f64
+
+ // Initialize dense tensors, then convert them to sparse vectors.
+ %0 = constant dense<[ -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 ]> : tensor<10xi32>
+ %1 = sparse_tensor.convert %0 : tensor<10xi32> to tensor<10xi32, #SV>
+ %2 = constant dense<[ -4.4, -3.3, -2.2, -1.1, 0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf32>
+ %3 = sparse_tensor.convert %2 : tensor<10xf32> to tensor<10xf32, #SV>
+ %4 = constant dense<[ -4.4, -3.3, -2.2, -1.1, 0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf64>
+ %5 = sparse_tensor.convert %4 : tensor<10xf64> to tensor<10xf64, #SV>
+ %6 = constant dense<[ 4294967295.0, 4294967294.0, 4294967293.0, 4294967292.0,
+ 0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf64>
+ %7 = sparse_tensor.convert %6 : tensor<10xf64> to tensor<10xf64, #SV>
+
+ //
+ // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
+ //
+ %c0 = call @sparse_cast_s32_to_f32(%1) : (tensor<10xi32, #SV>) -> tensor<10xf32>
+ %m0 = memref.buffer_cast %c0 : memref<10xf32>
+ %v0 = vector.transfer_read %m0[%z], %f: memref<10xf32>, vector<10xf32>
+ vector.print %v0 : vector<10xf32>
+
+ //
+ // CHECK: ( 4.29497e+09, 4.29497e+09, 4.29497e+09, 4.29497e+09, 0, 1, 2, 3, 4, 305 )
+ //
+ %c1 = call @sparse_cast_u32_to_f32(%1) : (tensor<10xi32, #SV>) -> tensor<10xf32>
+ %m1 = memref.buffer_cast %c1 : memref<10xf32>
+ %v1 = vector.transfer_read %m1[%z], %f: memref<10xf32>, vector<10xf32>
+ vector.print %v1 : vector<10xf32>
+
+ //
+ // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
+ //
+ %c2 = call @sparse_cast_f32_to_s32(%3) : (tensor<10xf32, #SV>) -> tensor<10xi32>
+ %m2 = memref.buffer_cast %c2 : memref<10xi32>
+ %v2 = vector.transfer_read %m2[%z], %i: memref<10xi32>, vector<10xi32>
+ vector.print %v2 : vector<10xi32>
+
+ //
+ // CHECK: ( 4294967295, 4294967294, 4294967293, 4294967292, 0, 1, 2, 3, 4, 305 )
+ //
+ %c3 = call @sparse_cast_f64_to_u32(%7) : (tensor<10xf64, #SV>) -> tensor<10xi32>
+ %m3 = memref.buffer_cast %c3 : memref<10xi32>
+ %v3 = vector.transfer_read %m3[%z], %i: memref<10xi32>, vector<10xi32>
+ %vu = vector.bitcast %v3 : vector<10xi32> to vector<10xui32>
+ vector.print %vu : vector<10xui32>
+
+ //
+ // CHECK: ( -4.4, -3.3, -2.2, -1.1, 0, 1.1, 2.2, 3.3, 4.4, 305.5 )
+ //
+ %c4 = call @sparse_cast_f32_to_f64(%3) : (tensor<10xf32, #SV>) -> tensor<10xf64>
+ %m4 = memref.buffer_cast %c4 : memref<10xf64>
+ %v4 = vector.transfer_read %m4[%z], %d: memref<10xf64>, vector<10xf64>
+ vector.print %v4 : vector<10xf64>
+
+ //
+ // CHECK: ( -4.4, -3.3, -2.2, -1.1, 0, 1.1, 2.2, 3.3, 4.4, 305.5 )
+ //
+ %c5 = call @sparse_cast_f64_to_f32(%5) : (tensor<10xf64, #SV>) -> tensor<10xf32>
+ %m5 = memref.buffer_cast %c5 : memref<10xf32>
+ %v5 = vector.transfer_read %m5[%z], %f: memref<10xf32>, vector<10xf32>
+ vector.print %v5 : vector<10xf32>
+
+ //
+ // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
+ //
+ %c6 = call @sparse_cast_s32_to_u64(%1) : (tensor<10xi32, #SV>) -> tensor<10xi64>
+ %m6 = memref.buffer_cast %c6 : memref<10xi64>
+ %v6 = vector.transfer_read %m6[%z], %l: memref<10xi64>, vector<10xi64>
+ vector.print %v6 : vector<10xi64>
+
+ //
+ // CHECK: ( 4294967292, 4294967293, 4294967294, 4294967295, 0, 1, 2, 3, 4, 305 )
+ //
+ %c7 = call @sparse_cast_u32_to_s64(%1) : (tensor<10xi32, #SV>) -> tensor<10xi64>
+ %m7 = memref.buffer_cast %c7 : memref<10xi64>
+ %v7 = vector.transfer_read %m7[%z], %l: memref<10xi64>, vector<10xi64>
+ vector.print %v7 : vector<10xi64>
+
+ //
+ // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 49 )
+ //
+ %c8 = call @sparse_cast_i32_to_i8(%1) : (tensor<10xi32, #SV>) -> tensor<10xi8>
+ %m8 = memref.buffer_cast %c8 : memref<10xi8>
+ %v8 = vector.transfer_read %m8[%z], %b: memref<10xi8>, vector<10xi8>
+ vector.print %v8 : vector<10xi8>
+
+ //
+ // CHECK: ( -1064514355, -1068289229, -1072902963, -1081291571, 0, 1066192077, 1074580685, 1079194419, 1082969293, 1134084096 )
+ //
+ %c9 = call @sparse_cast_f32_as_s32(%3) : (tensor<10xf32, #SV>) -> tensor<10xi32>
+ %m9 = memref.buffer_cast %c9 : memref<10xi32>
+ %v9 = vector.transfer_read %m9[%z], %i: memref<10xi32>, vector<10xi32>
+ vector.print %v9 : vector<10xi32>
+
+ return
+ }
+}