[Mlir-commits] [mlir] 5da2133 - [mlir][sparse] generalize reduction support in sparse compiler

Aart Bik llvmlistbot at llvm.org
Wed Sep 22 12:36:54 PDT 2021


Author: Aart Bik
Date: 2021-09-22T12:36:46-07:00
New Revision: 5da21338bcd0fe377fc788b1b24cf055ab1977af

URL: https://github.com/llvm/llvm-project/commit/5da21338bcd0fe377fc788b1b24cf055ab1977af
DIFF: https://github.com/llvm/llvm-project/commit/5da21338bcd0fe377fc788b1b24cf055ab1977af.diff

LOG: [mlir][sparse] generalize reduction support in sparse compiler

Reductions are now supported not only for SUM, but also for PRODUCT, AND, OR,
and XOR. The MIN and MAX reductions are still to be done (this also depends on
recognizing these operations in cmp-select constructs).
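
For illustration, here is a sketch (not verbatim compiler output; the SSA and
buffer names, the vector length of 16, and the 1024-element shape are only
illustrative, borrowed from the reduction_d test updated below) of the
vectorized pattern now emitted for an f32 PRODUCT reduction: the accumulator
vector is seeded with the reduction identity in all lanes and the incoming
scalar in lane 0, the loop reduces element-wise, and a single vector.reduction
of the matching kind folds the lanes at the end.

  %c0    = constant 0 : index
  %i0    = constant 0 : i32
  %c16   = constant 16 : index
  %c1024 = constant 1024 : index
  %v1    = constant dense<1.000000e+00> : vector<16xf32>  // identity for "mul"
  %x     = memref.load %argx[] : memref<f32>               // incoming scalar
  %init  = vector.insertelement %x, %v1[%i0 : i32] : vector<16xf32>  // | 1 | .. | 1 | x |
  %red   = scf.for %i = %c0 to %c1024 step %c16
             iter_args(%acc = %init) -> (vector<16xf32>) {
    %a = vector.load %mem[%i] : memref<1024xf32>, vector<16xf32>
    %m = mulf %acc, %a : vector<16xf32>
    scf.yield %m : vector<16xf32>
  }
  %res   = vector.reduction "mul", %red : vector<16xf32> into f32

The kind string passed to vector.reduction ("add", "mul", "and", "or", "xor")
is derived from the reduction kind by getReductionName below.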

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D110203

Added: 
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir

Modified: 
    mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
    mlir/test/Dialect/SparseTensor/sparse_vector.mlir

Removed: 
    


################################################################################
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index f733856443a9..28fb3a69c3be 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -28,11 +28,18 @@
 using namespace mlir;
 using namespace mlir::sparse_tensor;
 
+//===----------------------------------------------------------------------===//
+// Declarations of data structures.
+//===----------------------------------------------------------------------===//
+
 namespace {
 
 // Iteration graph sorting.
 enum SortMask { kSparseOnly = 0x0, kIncludeDense = 0x1, kIncludeUndef = 0x2 };
 
+// Reduction kinds.
+enum Reduction { kSum, kProduct, kAnd, kOr, kXor };
+
 // Code generation.
 struct CodeGen {
   CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops)
@@ -68,6 +75,7 @@ struct CodeGen {
   // is most effective; we could generalize to more outer and while-loops.
   unsigned redExp;
   Value redVal;
+  Reduction redKind;
   // Current vector length and mask.
   unsigned curVecLength;
   Value curVecMask;
@@ -75,8 +83,12 @@ struct CodeGen {
 
 } // namespace
 
-// Helper method to apply dimension ordering permutation.
-static unsigned perm(SparseTensorEncodingAttr &enc, unsigned d) {
+//===----------------------------------------------------------------------===//
+// Sparse compiler analysis methods.
+//===----------------------------------------------------------------------===//
+
+/// Helper method to apply dimension ordering permutation.
+static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
   if (enc) {
     auto order = enc.getDimOrdering();
     if (order) {
@@ -87,8 +99,8 @@ static unsigned perm(SparseTensorEncodingAttr &enc, unsigned d) {
   return d;
 }
 
-// Helper method to translate dim level type to internal representation.
-static Dim toDim(SparseTensorEncodingAttr &enc, unsigned d) {
+/// Helper method to translate dim level type to internal representation.
+static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
   if (enc) {
     SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
     if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
@@ -283,6 +295,83 @@ static bool isAdmissableTensorExp(Merger &merger, linalg::GenericOp op,
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse compiler synthesis methods.
+//===----------------------------------------------------------------------===//
+
+/// Maps reduction kind to name encoding.
+static StringRef getReductionName(Reduction kind) {
+  switch (kind) {
+  case kSum:
+    return "add";
+  case kProduct:
+    return "mul";
+  case kAnd:
+    return "and";
+  case kOr:
+    return "or";
+  case kXor:
+    return "xor";
+  }
+  llvm_unreachable("unknown reduction kind");
+}
+
+/// Maps operation to reduction.
+static Reduction getReduction(Kind kind) {
+  switch (kind) {
+  case Kind::kAddF:
+  case Kind::kAddI:
+  case Kind::kSubF:
+  case Kind::kSubI:
+    return kSum;
+  case Kind::kMulF:
+  case Kind::kMulI:
+    return kProduct;
+  case Kind::kAndI:
+    return kAnd;
+  case Kind::kOrI:
+    return kOr;
+  case Kind::kXorI:
+    return kXor;
+  default:
+    llvm_unreachable("unexpected reduction operator");
+  }
+}
+
+/// Generates an initial value for vector reductions, following the scheme
+/// given in Chapter 5 of "The Software Vectorization Handbook", where the
+/// initial scalar value is correctly embedded in the vector reduction value,
+/// and a straightforward horizontal reduction will complete the operation.
+static Value genReductionInit(PatternRewriter &rewriter, Location loc,
+                              Reduction kind, VectorType vtp, Value r) {
+  switch (kind) {
+  case kSum:
+  case kXor: {
+    // Initialize reduction vector to: | 0 | .. | 0 | r |
+    Attribute zero = rewriter.getZeroAttr(vtp);
+    Value vec = rewriter.create<ConstantOp>(loc, vtp, zero);
+    return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
+  }
+  case kProduct: {
+    // Initialize reduction vector to: | 1 | .. | 1 | r |
+    Type etp = vtp.getElementType();
+    Attribute one;
+    if (etp.isa<FloatType>())
+      one = rewriter.getFloatAttr(etp, 1.0);
+    else
+      one = rewriter.getIntegerAttr(etp, 1);
+    Value vec =
+        rewriter.create<ConstantOp>(loc, vtp, DenseElementsAttr::get(vtp, one));
+    return rewriter.create<vector::InsertElementOp>(loc, r, vec, 0);
+  }
+  case kAnd:
+  case kOr:
+    // Initialize reduction vector to: | r | .. | r | r |
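+    // (This is safe because AND and OR are idempotent: a horizontal reduction
+    // of a vector with r in every lane yields r again.)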
+    return rewriter.create<vector::BroadcastOp>(loc, vtp, r);
+  }
+  llvm_unreachable("unknown reduction kind");
+}
+
 /// Maps sparse integer option to actual integral storage type.
 static Type genIntType(PatternRewriter &rewriter, unsigned width) {
   if (width == 0)
@@ -644,11 +733,15 @@ static Value genReductionStart(Merger &merger, CodeGen &codegen,
                                linalg::GenericOp op) {
   if (codegen.redVal)
     return codegen.redVal; // chained with previous for-loop
-  if (codegen.curVecLength > 1) {
-    // TODO: assumes + reductions for now
+  // Generate vector or scalar start of a reduction.
+  unsigned vl = codegen.curVecLength;
+  if (vl > 1) {
     VectorType vtp = vectorType(codegen, codegen.buffers[codegen.redExp]);
-    return rewriter.create<ConstantOp>(op.getLoc(), vtp,
-                                       rewriter.getZeroAttr(vtp));
+    assert(!merger.exp(codegen.redExp).val);
+    codegen.curVecLength = 1;
+    Value load = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
+    codegen.curVecLength = vl;
+    return genReductionInit(rewriter, op.getLoc(), codegen.redKind, vtp, load);
   }
   return genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
 }
@@ -661,19 +754,12 @@ static void genReductionEnd(Merger &merger, CodeGen &codegen,
     return;
   assert(codegen.curVecLength == 1);
   codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
+  // Generate vector or scalar end of a reduction.
   if (auto vtp = red.getType().dyn_cast<VectorType>()) {
-    // TODO: assumes + reductions for now
-    StringAttr kind = rewriter.getStringAttr("add");
-    Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
-    // Integer reductions don't accept an accumulator.
-    if (vtp.getElementType().isa<IntegerType>()) {
-      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
-                                                 kind, red, ValueRange{});
-      red = rewriter.create<AddIOp>(op.getLoc(), red, ld);
-    } else {
-      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
-                                                 kind, red, ld);
-    }
+    StringRef name = getReductionName(codegen.redKind);
+    StringAttr kind = rewriter.getStringAttr(name);
+    red = rewriter.create<vector::ReductionOp>(
+        op.getLoc(), vtp.getElementType(), kind, red, ValueRange{});
   }
   genTensorStore(merger, codegen, rewriter, op, red);
 }
@@ -725,7 +811,8 @@ static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a,
 /// Hoists loop invariant tensor loads for which indices have been exhausted.
 static void genInvariants(Merger &merger, CodeGen &codegen,
                           PatternRewriter &rewriter, linalg::GenericOp op,
-                          unsigned exp, unsigned ldx, bool hoist) {
+                          unsigned exp, unsigned ldx, bool hoist,
+                          Kind last = Kind::kTensor) {
   if (exp == -1u)
     return;
   if (merger.exp(exp).kind == Kind::kTensor) {
@@ -743,6 +830,7 @@ static void genInvariants(Merger &merger, CodeGen &codegen,
     OpOperand *lhs = op.getOutputOperand(0);
     if (lhs == t) {
       codegen.redExp = hoist ? exp : -1u;
+      codegen.redKind = getReduction(last);
     } else if (atLevel) {
       merger.exp(exp).val =
           hoist ? genTensorLoad(merger, codegen, rewriter, op, exp) : Value();
@@ -751,10 +839,11 @@ static void genInvariants(Merger &merger, CodeGen &codegen,
     // Traverse into the binary operations. Note that we only hoist
     // tensor loads, since subsequent MLIR/LLVM passes know how to
     // deal with all other kinds of derived loop invariants.
+    Kind last = merger.exp(exp).kind;
     unsigned e0 = merger.exp(exp).children.e0;
     unsigned e1 = merger.exp(exp).children.e1;
-    genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist);
-    genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist);
+    genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist, last);
+    genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist, last);
   }
 }
 
@@ -1233,6 +1322,10 @@ static void genResult(Merger &merger, CodeGen &codegen,
   rewriter.replaceOp(op, result);
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse compiler rewriting methods.
+//===----------------------------------------------------------------------===//
+
 namespace {
 
 /// Sparse rewriting rule for generic Linalg operation.

diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
index ee8f948eee4d..2b2492fc1db1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
@@ -210,32 +210,38 @@ func @mul_s(%arga: tensor<1024xf32, #SparseVector>, %argb: tensor<1024xf32>, %ar
 //
 // CHECK-VEC1-LABEL: func @reduction_d
 // CHECK-VEC1-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC1-DAG:   %[[i0:.*]] = constant 0 : i32
 // CHECK-VEC1-DAG:   %[[c16:.*]] = constant 16 : index
 // CHECK-VEC1-DAG:   %[[c1024:.*]] = constant 1024 : index
 // CHECK-VEC1-DAG:   %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC1:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC1:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
+// CHECK-VEC1:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
+// CHECK-VEC1:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC1:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC1:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
 // CHECK-VEC1:         %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK-VEC1:         %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
 // CHECK-VEC1:         scf.yield %[[a]] : vector<16xf32>
 // CHECK-VEC1:       }
-// CHECK-VEC1:       %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC1:       %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
 // CHECK-VEC1:       return
 //
 // CHECK-VEC2-LABEL: func @reduction_d
 // CHECK-VEC2-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC2-DAG:   %[[i0:.*]] = constant 0 : i32
 // CHECK-VEC2-DAG:   %[[c16:.*]] = constant 16 : index
 // CHECK-VEC2-DAG:   %[[c1024:.*]] = constant 1024 : index
 // CHECK-VEC2-DAG:   %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC2:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC2:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
+// CHECK-VEC2:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[i0]] : i32] : vector<16xf32>
+// CHECK-VEC2:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC2:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC2:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
 // CHECK-VEC2:         %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK-VEC2:         %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
 // CHECK-VEC2:         scf.yield %[[a]] : vector<16xf32>
 // CHECK-VEC2:       }
-// CHECK-VEC2:       %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC2:       %{{.*}} = vector.reduction "add", %[[red]] : vector<16xf32> into f32
 // CHECK-VEC2:       return
 //
 func @reduction_d(%arga: tensor<1024xf32, #DenseVector>, %argb: tensor<1024xf32>, %argx: tensor<f32>) -> tensor<f32> {

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
new file mode 100644
index 000000000000..d2d04670cc17
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
@@ -0,0 +1,216 @@
+// RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
+// RUN:   --sparsification --sparse-tensor-conversion \
+// RUN:   --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// Do the same run, but now with SIMDization as well. This should not change the outcome.
+//
+// RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
+// RUN:   --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
+// RUN:   --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
+#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense"      ] }>
+
+#trait_reduction = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> ()>    // x (scalar out)
+  ],
+  iterator_types = ["reduction"],
+  doc = "x += OPER_i a(i)"
+}
+
+// An example of vector reductions.
+module {
+
+  func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = addi %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
+                          %argx: tensor<f32>) -> tensor<f32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xf32, #SV>)
+      outs(%argx: tensor<f32>) {
+        ^bb(%a: f32, %x: f32):
+          %0 = addf %x, %a : f32
+          linalg.yield %0 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+  }
+
+  func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
+                           %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #DV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = muli %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
+                           %argx: tensor<f32>) -> tensor<f32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xf32, #DV>)
+      outs(%argx: tensor<f32>) {
+        ^bb(%a: f32, %x: f32):
+          %0 = mulf %x, %a : f32
+          linalg.yield %0 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+  }
+
+  func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #DV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = and %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
+                         %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = or %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
+                          %argx: tensor<i32>) -> tensor<i32> {
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %x: i32):
+          %0 = xor %x, %a : i32
+          linalg.yield %0 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func @dump_i32(%arg0 : tensor<i32>) {
+    %m = memref.buffer_cast %arg0 : memref<i32>
+    %v = memref.load %m[] : memref<i32>
+    vector.print %v : i32
+    return
+  }
+
+  func @dump_f32(%arg0 : tensor<f32>) {
+    %m = memref.buffer_cast %arg0 : memref<f32>
+    %v = memref.load %m[] : memref<f32>
+    vector.print %v : f32
+    return
+  }
+
+  func @entry() {
+    %ri = constant dense< 7   > : tensor<i32>
+    %rf = constant dense< 2.0 > : tensor<f32>
+
+    %c_0_i32 = constant dense<[
+      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
+    ]> : tensor<32xi32>
+
+    %c_0_f32 = constant dense<[
+      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
+      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
+    ]> : tensor<32xf32>
+
+    %c_1_i32 = constant dense<[
+      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
+    ]> : tensor<32xi32>
+
+    %c_1_f32 = constant dense<[
+      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
+      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
+      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
+    ]> : tensor<32xf32>
+
+    // Convert constants to annotated tensors.
+    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
+      : tensor<32xi32> to tensor<32xi32, #SV>
+    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
+      : tensor<32xf32> to tensor<32xf32, #SV>
+    %dense_input_i32 = sparse_tensor.convert %c_1_i32
+      : tensor<32xi32> to tensor<32xi32, #DV>
+    %dense_input_f32 = sparse_tensor.convert %c_1_f32
+      : tensor<32xf32> to tensor<32xf32, #DV>
+
+    // Call the kernels.
+    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
+       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
+    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
+       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
+       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
+    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
+       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
+       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+
+    // Verify results.
+    //
+    // CHECK: 26
+    // CHECK: 27.5
+    // CHECK: 3087
+    // CHECK: 168
+    // CHECK: 1
+    // CHECK: 15
+    // CHECK: 10
+    //
+    call @dump_i32(%0) : (tensor<i32>) -> ()
+    call @dump_f32(%1) : (tensor<f32>) -> ()
+    call @dump_i32(%2) : (tensor<i32>) -> ()
+    call @dump_f32(%3) : (tensor<f32>) -> ()
+    call @dump_i32(%4) : (tensor<i32>) -> ()
+    call @dump_i32(%5) : (tensor<i32>) -> ()
+    call @dump_i32(%6) : (tensor<i32>) -> ()
+
+    return
+  }
+}




More information about the Mlir-commits mailing list