[Mlir-commits] [mlir] 378f188 - [mlir][sparse] enhance sparse reduction support

Aart Bik llvmlistbot at llvm.org
Thu Jun 1 16:30:32 PDT 2023


Author: Aart Bik
Date: 2023-06-01T16:30:21-07:00
New Revision: 378f1885e3536ddf93e780f25a84ad493140ff42

URL: https://github.com/llvm/llvm-project/commit/378f1885e3536ddf93e780f25a84ad493140ff42
DIFF: https://github.com/llvm/llvm-project/commit/378f1885e3536ddf93e780f25a84ad493140ff42.diff

LOG: [mlir][sparse] enhance sparse reduction support

Formerly, we accepted and/prod reductions as standard
reductions, but these change semantics after sparsification,
since only the stored values (and not the implicit zeros) are
reduced. Therefore, we now only accept standard reductions
that are insensitive to implicit vs. explicit zeros, and leave
the more complex reductions to the sparse_tensor.reduce custom
reduction implementation.
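
For context, a minimal sketch (modeled on the sum_reduction_f32 kernel in the
integration tests touched below) of a reduction that remains admissible after
this change: addition is insensitive to implicit zeros, so reducing only the
stored values gives the same result as reducing all values, and a plain
arith.addf body can still be sparsified directly.

  #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

  #trait = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a (in)
      affine_map<(i) -> ()>    // x (out)
    ],
    iterator_types = ["reduction"]
  }

  // Still admissible: a standard sum reduction over a sparse vector.
  func.func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                               %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %t = arith.addf %x, %a : f32
          linalg.yield %t : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

Product, and, min, and max reductions, by contrast, must now be expressed with
the sparse_tensor.reduce custom reduction shown in the diff below.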

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D151929

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
    mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
    mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index e37062f5f8104..e1bcccb9b8f09 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1055,18 +1055,22 @@ def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResu
   let summary = "Custom reduction operation utilized within linalg.generic";
   let description = [{
       Defines a computation with a `linalg.generic` operation that takes two
-      operands and an identity value and reduces all values down to a single
-      result based on the computation in the region.
+      operands and an identity value and reduces all stored values down to a
+      single result based on the computation in the region.
 
       The region must contain exactly one block taking two arguments. The block
       must end with a sparse_tensor.yield and the output must match the input
       argument types.
 
-      Note that this operation is only required for custom reductions beyond the
-      standard operations (add, mul, and, or, etc). The `linalg.generic`
-      `iterator_types` defines which indices are being reduced. When the associated
-      operands are used in an operation, a reduction will occur. The use of this
-      explicit `reduce` operation is not required in most cases.
+      Note that this operation is only required for custom reductions beyond
+      the standard reduction operations (add, sub, or, xor) that can be
+      sparsified by merely reducing the stored values. More elaborate reduction
+      operations (mul, and, min, max, etc.) would need to account for implicit
+      zeros as well. They can still be handled using this custom reduction
+      operation. The `linalg.generic` `iterator_types` defines which indices
+      are being reduced. When the associated operands are used in an operation,
+      a reduction will occur. The use of this explicit `reduce` operation
+      is not required in most cases.
 
       Example of Matrix->Vector reduction using max(product(x_i), 100):
 

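To make the updated description concrete, here is the pattern it refers to,
excerpted from the integration test added below: a product reduction expressed
through the custom sparse_tensor.reduce region instead of a plain multiply in
the linalg body, with the extracted output value acting as the identity of the
custom reduction.

  #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

  #trait_reduction = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a (in)
      affine_map<(i) -> ()>    // x (scalar out)
    ],
    iterator_types = ["reduction"]
  }

  // Product reduction over the stored values, using the custom reduction.
  func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>   // identity value
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %b: i32):
          %1 = sparse_tensor.reduce %a, %b, %c : i32 {
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
          linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
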
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 4334290de6498..7064b4f0d2049 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1848,6 +1848,24 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
     if (!findSparseAnnotations(env, idxReducBased))
       return failure();
 
+    // Only standard reduction operations (add, sub, or, xor) that can be
+    // sparsified by merely reducing the stored values are admissible. More
+    // elaborate reduction operations (such as mul, and, min, max) would need
+    // to know whether implicit zeros occur as well. They can still be
+    // implemented with a custom reduction operation, accepted here as well.
+    if (op.getNumReductionLoops() > 0) {
+      Operation *yield = op.getRegion().front().getTerminator();
+      assert(isa<linalg::YieldOp>(yield));
+      Operation *redop = yield->getOperand(0).getDefiningOp();
+      if (!isa<arith::AddFOp>(redop) && !isa<complex::AddOp>(redop) &&
+          !isa<arith::AddIOp>(redop) && !isa<arith::SubFOp>(redop) &&
+          !isa<complex::SubOp>(redop) && !isa<arith::SubIOp>(redop) &&
+          !isa<arith::OrIOp>(redop) && !isa<arith::XOrIOp>(redop) &&
+          !isa<ReduceOp>(redop)) {
+        return failure();
+      }
+    }
+
     // Constructs the tensor expressions tree from `op`, returns failure if the
     // tree can not be built or the tensor expression is inadmissible.
     if (failed(env.initTensorExp()))

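For illustration, the kind of kernel that now hits this bail-out is exactly the
product reduction removed from vectorize_reduction.mlir below: the value
yielded from the linalg body is defined by arith.muli, which is not in the
admissible set, so the rewrite pattern returns failure rather than silently
changing the reduction's semantics.

  #SparseVector = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

  #trait = {
    indexing_maps = [
      affine_map<(i) -> (i)>,  // a (in)
      affine_map<(i) -> ()>    // x (out)
    ],
    iterator_types = ["reduction"]
  }

  // No longer admissible as-is: a plain muli reduction on a sparse operand.
  func.func @sparse_reduction_muli(%argx: tensor<i32>,
                                   %arga: tensor<?xi32, #SparseVector>) -> tensor<i32> {
    %0 = linalg.generic #trait
       ins(%arga: tensor<?xi32, #SparseVector>)
       outs(%argx: tensor<i32>) {
       ^bb(%a: i32, %x: i32):
         %t = arith.muli %x, %a : i32
         linalg.yield %t : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
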
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
index 53c6410a57cb8..09bbe410bfc16 100644
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -5,213 +5,11 @@
 
 // -----
 
-// Check that we recognize a reduction with a mul operator.
-// We use two dimensions here to check that the vectorization
-// is not affected by how the outer loop is layed out.
-// In other words, we should be able to vectorize the sparse inner loop
-// regardless of whether the outer loop is dense or sparse.
-//
-// For this particular test, we expect:
-// With vectorization on:
-// dense scf.for
-//   init vector_accumulator = {scalar_accumulator, 1.0, 1.0, ...}
-//   sparse scf.for
-//     vectorized mul in vector_accumulator, vector_input
-//   horizontal reduction of the vector_accumulator to scalar_accumulator
-// final store of scalar_accumulaor
-//
-// With vectorization off:
-// dense scf.for
-//   sparse scf.for
-//     mul in accumulator
-// final store
-//
-// CHECK-ON-LABEL:   func.func @sparse_product_reduction_dense_sparse(
-// CHECK-ON-SAME:                                                     %[[VAL_0:.*]]: tensor<f64>,
-// CHECK-ON-SAME:                                                     %[[VAL_1:.*]]: tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) -> tensor<f64> {
-// CHECK-ON-DAG:       %[[VAL_2:.*]] = arith.constant 8 : index
-// CHECK-ON-DAG:       %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64>
-// CHECK-ON-DAG:       %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64>
-// CHECK-ON-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
-// CHECK-ON-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK-ON-DAG:       %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
-// CHECK-ON:           %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
-// CHECK-ON:           %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
-// CHECK-ON:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
-// CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref<f64>
-// CHECK-ON:           %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f64) {
-// CHECK-ON:             %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_13]]] : memref<?xindex>
-// CHECK-ON:             %[[VAL_16:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : index
-// CHECK-ON:             %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK-ON:             %[[VAL_18:.*]] = vector.insertelement %[[VAL_14]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64>
-// CHECK-ON:             %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_2]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (vector<8xf64>) {
-// CHECK-ON:               %[[VAL_22:.*]] = affine.min #map(%[[VAL_17]], %[[VAL_20]]){{\[}}%[[VAL_2]]]
-// CHECK-ON:               %[[VAL_23:.*]] = vector.create_mask %[[VAL_22]] : vector<8xi1>
-// CHECK-ON:               %[[VAL_24:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_20]]], %[[VAL_23]], %[[VAL_4]] : memref<?xf64>, vector<8xi1>, vector<8xf64> into vector<8xf64>
-// CHECK-ON:               %[[VAL_25:.*]] = arith.mulf %[[VAL_21]], %[[VAL_24]] : vector<8xf64>
-// CHECK-ON:               %[[VAL_26:.*]] = arith.select %[[VAL_23]], %[[VAL_25]], %[[VAL_21]] : vector<8xi1>, vector<8xf64>
-// CHECK-ON:               scf.yield %[[VAL_26]] : vector<8xf64>
-// CHECK-ON:             } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:             %[[VAL_27:.*]] = vector.reduction <mul>, %[[VAL_28:.*]] : vector<8xf64> into f64
-// CHECK-ON:             scf.yield %[[VAL_27]] : f64
-// CHECK-ON:           } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:           memref.store %[[VAL_29:.*]], %[[VAL_10]][] : memref<f64>
-// CHECK-ON:           %[[VAL_30:.*]] = bufferization.to_tensor %[[VAL_10]] : memref<f64>
-// CHECK-ON:           return %[[VAL_30]] : tensor<f64>
-// CHECK-ON:         }
-//
-// CHECK-OFF-LABEL:   func.func @sparse_product_reduction_dense_sparse(
-// CHECK-OFF-SAME:                                                     %[[VAL_0:.*]]: tensor<f64>,
-// CHECK-OFF-SAME:                                                     %[[VAL_1:.*]]: tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) -> tensor<f64> {
-// CHECK-OFF-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-OFF:           %[[VAL_4:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
-// CHECK-OFF:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
-// CHECK-OFF:           %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
-// CHECK-OFF:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
-// CHECK-OFF:           %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref<f64>
-// CHECK-OFF:           %[[VAL_9:.*]] = scf.for %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (f64) {
-// CHECK-OFF:             %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_10]]] : memref<?xindex>
-// CHECK-OFF:             %[[VAL_13:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : index
-// CHECK-OFF:             %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_13]]] : memref<?xindex>
-// CHECK-OFF:             %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_12]] to %[[VAL_14]] step %[[VAL_3]] iter_args(%[[VAL_17:.*]] = %[[VAL_11]]) -> (f64) {
-// CHECK-OFF:               %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_16]]] : memref<?xf64>
-// CHECK-OFF:               %[[VAL_19:.*]] = arith.mulf %[[VAL_17]], %[[VAL_18]] : f64
-// CHECK-OFF:               scf.yield %[[VAL_19]] : f64
-// CHECK-OFF:             } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:             scf.yield %[[VAL_20:.*]] : f64
-// CHECK-OFF:           } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:           memref.store %[[VAL_21:.*]], %[[VAL_7]][] : memref<f64>
-// CHECK-OFF:           %[[VAL_22:.*]] = bufferization.to_tensor %[[VAL_7]] : memref<f64>
-// CHECK-OFF:           return %[[VAL_22]] : tensor<f64>
-// CHECK-OFF:         }
-
-#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["dense","compressed"]}>
-
-#trait = {
-  indexing_maps = [
-    affine_map<(i,j) -> (i,j)>,  // a (in)
-    affine_map<(i,j) -> ()>      // x (out)
-  ],
-  iterator_types = ["reduction", "reduction"]
-}
-
-func.func @sparse_product_reduction_dense_sparse(%argx: tensor<f64>,
-                             %arga: tensor<?x128xf64, #SparseVector>)
- -> tensor<f64> {
-  %0 = linalg.generic #trait
-     ins(%arga: tensor<?x128xf64, #SparseVector>)
-      outs(%argx: tensor<f64>) {
-      ^bb(%a: f64, %x: f64):
-        %t = arith.mulf %x, %a: f64
-        linalg.yield %t : f64
-  } -> tensor<f64>
-  return %0 : tensor<f64>
-}
-
-// -----
-
-// Same as sparse_product_reduction_dense_sparse but with the outer loop being sparse.
-//
-// CHECK-ON-LABEL:   func.func @sparse_product_reduction_sparse_sparse(
-// CHECK-ON-SAME:                                                      %[[VAL_0:.*]]: tensor<f64>,
-// CHECK-ON-SAME:                                                      %[[VAL_1:.*]]: tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) -> tensor<f64> {
-// CHECK-ON-DAG:       %[[VAL_2:.*]] = arith.constant 8 : index
-// CHECK-ON-DAG:       %[[VAL_3:.*]] = arith.constant dense<1.000000e+00> : vector<8xf64>
-// CHECK-ON-DAG:       %[[VAL_4:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64>
-// CHECK-ON-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
-// CHECK-ON-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK-ON:           %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
-// CHECK-ON:           %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
-// CHECK-ON:           %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf64>
-// CHECK-ON:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
-// CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref<f64>
-// CHECK-ON:           %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
-// CHECK-ON:           %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_6]] iter_args(%[[VAL_16:.*]] = %[[VAL_11]]) -> (f64) {
-// CHECK-ON:             %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xindex>
-// CHECK-ON:             %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_6]] : index
-// CHECK-ON:             %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref<?xindex>
-// CHECK-ON:             %[[VAL_20:.*]] = vector.insertelement %[[VAL_16]], %[[VAL_3]]{{\[}}%[[VAL_5]] : index] : vector<8xf64>
-// CHECK-ON:             %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_17]] to %[[VAL_19]] step %[[VAL_2]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (vector<8xf64>) {
-// CHECK-ON:               %[[VAL_24:.*]] = affine.min #map(%[[VAL_19]], %[[VAL_22]]){{\[}}%[[VAL_2]]]
-// CHECK-ON:               %[[VAL_25:.*]] = vector.create_mask %[[VAL_24]] : vector<8xi1>
-// CHECK-ON:               %[[VAL_26:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_22]]], %[[VAL_25]], %[[VAL_4]] : memref<?xf64>, vector<8xi1>, vector<8xf64> into vector<8xf64>
-// CHECK-ON:               %[[VAL_27:.*]] = arith.mulf %[[VAL_23]], %[[VAL_26]] : vector<8xf64>
-// CHECK-ON:               %[[VAL_28:.*]] = arith.select %[[VAL_25]], %[[VAL_27]], %[[VAL_23]] : vector<8xi1>, vector<8xf64>
-// CHECK-ON:               scf.yield %[[VAL_28]] : vector<8xf64>
-// CHECK-ON:             } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:             %[[VAL_29:.*]] = vector.reduction <mul>, %[[VAL_30:.*]] : vector<8xf64> into f64
-// CHECK-ON:             scf.yield %[[VAL_29]] : f64
-// CHECK-ON:           } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:           memref.store %[[VAL_31:.*]], %[[VAL_10]][] : memref<f64>
-// CHECK-ON:           %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_10]] : memref<f64>
-// CHECK-ON:           return %[[VAL_32]] : tensor<f64>
-// CHECK-ON:         }
-//
-// CHECK-OFF-LABEL:   func.func @sparse_product_reduction_sparse_sparse(
-// CHECK-OFF-SAME:                                                     %[[VAL_0:.*]]: tensor<f64>,
-// CHECK-OFF-SAME:                                                     %[[VAL_1:.*]]: tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) -> tensor<f64> {
-// CHECK-OFF-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-OFF:           %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
-// CHECK-OFF:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
-// CHECK-OFF:           %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x128xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf64>
-// CHECK-OFF:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
-// CHECK-OFF:           %[[VAL_8:.*]] = memref.load %[[VAL_7]][] : memref<f64>
-// CHECK-OFF:           %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK-OFF:           %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK-OFF:           %[[VAL_11:.*]] = scf.for %[[VAL_12:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] iter_args(%[[VAL_13:.*]] = %[[VAL_8]]) -> (f64) {
-// CHECK-OFF:             %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref<?xindex>
-// CHECK-OFF:             %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_3]] : index
-// CHECK-OFF:             %[[VAL_16:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_15]]] : memref<?xindex>
-// CHECK-OFF:             %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_3]] iter_args(%[[VAL_19:.*]] = %[[VAL_13]]) -> (f64) {
-// CHECK-OFF:               %[[VAL_20:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_18]]] : memref<?xf64>
-// CHECK-OFF:               %[[VAL_21:.*]] = arith.mulf %[[VAL_19]], %[[VAL_20]] : f64
-// CHECK-OFF:               scf.yield %[[VAL_21]] : f64
-// CHECK-OFF:             } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:             scf.yield %[[VAL_22:.*]] : f64
-// CHECK-OFF:           } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:           memref.store %[[VAL_23:.*]], %[[VAL_7]][] : memref<f64>
-// CHECK-OFF:           %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_7]] : memref<f64>
-// CHECK-OFF:           return %[[VAL_24]] : tensor<f64>
-// CHECK-OFF:         }
-#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed","compressed"]}>
-
-#trait = {
-  indexing_maps = [
-    affine_map<(i,j) -> (i,j)>,  // a (in)
-    affine_map<(i,j) -> ()>      // x (out)
-  ],
-  iterator_types = ["reduction", "reduction"]
-}
-
-func.func @sparse_product_reduction_sparse_sparse(%argx: tensor<f64>,
-                             %arga: tensor<?x128xf64, #SparseVector>)
- -> tensor<f64> {
-  %0 = linalg.generic #trait
-     ins(%arga: tensor<?x128xf64, #SparseVector>)
-      outs(%argx: tensor<f64>) {
-      ^bb(%a: f64, %x: f64):
-        %t = arith.mulf %x, %a: f64
-        linalg.yield %t : f64
-  } -> tensor<f64>
-  return %0 : tensor<f64>
-}
-
-// -----
-
-// sparse_product_reduction_dense_sparse and
-// sparse_product_reduction_sparse_sparse established that the outer loop
-// doesn't matter for vectorization.
-// As a result from this point forward, use tensors with fewer dimensions.
-
 // Check that we vectorize reductions with ori.
-// Note: The weird element type here is to check that we create the right
-// constant type for the pass-through value.
+
 // CHECK-ON-LABEL:   func.func @sparse_reduction_ori(
-// CHECK-ON-SAME:                                    %[[VAL_0:.*]]: tensor<i13>,
-// CHECK-ON-SAME:                                    %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
+// CHECK-ON-SAME:      %[[VAL_0:.*]]: tensor<i13>,
+// CHECK-ON-SAME:      %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
 // CHECK-ON-DAG:       %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK-ON-DAG:       %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13>
 // CHECK-ON-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
@@ -238,8 +36,8 @@ func.func @sparse_product_reduction_sparse_sparse(%argx: tensor<f64>,
 // CHECK-ON:         }
 //
 // CHECK-OFF-LABEL:   func.func @sparse_reduction_ori(
-// CHECK-OFF-SAME:                                    %[[VAL_0:.*]]: tensor<i13>,
-// CHECK-OFF-SAME:                                    %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
+// CHECK-OFF-SAME:      %[[VAL_0:.*]]: tensor<i13>,
+// CHECK-OFF-SAME:      %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
 // CHECK-OFF-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF:           %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
@@ -268,7 +66,7 @@ func.func @sparse_product_reduction_sparse_sparse(%argx: tensor<f64>,
 }
 
 func.func @sparse_reduction_ori(%argx: tensor<i13>,
-                             %arga: tensor<?xi13, #SparseVector>)
+                                %arga: tensor<?xi13, #SparseVector>)
  -> tensor<i13> {
   %0 = linalg.generic #trait
      ins(%arga: tensor<?xi13, #SparseVector>)
@@ -283,13 +81,12 @@ func.func @sparse_reduction_ori(%argx: tensor<i13>,
 // -----
 
 // Same test as sparse_reduction_ori except that the accumulator is on the
-// rhs of the operation.
-// This checks that we can recognize a reduction irrespective to where the
-// accumalator appears on commutative operations.
+// rhs of the operation. This checks that we can recognize a reduction
+// irrespective to where the accumulator appears on commutative operations.
 
 // CHECK-ON-LABEL:   func.func @sparse_reduction_ori_accumulator_on_rhs(
-// CHECK-ON-SAME:                                    %[[VAL_0:.*]]: tensor<i13>,
-// CHECK-ON-SAME:                                    %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
+// CHECK-ON-SAME:      %[[VAL_0:.*]]: tensor<i13>,
+// CHECK-ON-SAME:      %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
 // CHECK-ON-DAG:       %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK-ON-DAG:       %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi13>
 // CHECK-ON-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
@@ -316,8 +113,8 @@ func.func @sparse_reduction_ori(%argx: tensor<i13>,
 // CHECK-ON:         }
 //
 // CHECK-OFF-LABEL:   func.func @sparse_reduction_ori_accumulator_on_rhs(
-// CHECK-OFF-SAME:                                    %[[VAL_0:.*]]: tensor<i13>,
-// CHECK-OFF-SAME:                                    %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
+// CHECK-OFF-SAME:      %[[VAL_0:.*]]: tensor<i13>,
+// CHECK-OFF-SAME:      %[[VAL_1:.*]]: tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i13> {
 // CHECK-OFF-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF:           %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
@@ -346,7 +143,7 @@ func.func @sparse_reduction_ori(%argx: tensor<i13>,
 }
 
 func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
-                             %arga: tensor<?xi13, #SparseVector>)
+                                                   %arga: tensor<?xi13, #SparseVector>)
  -> tensor<i13> {
   %0 = linalg.generic #trait
      ins(%arga: tensor<?xi13, #SparseVector>)
@@ -360,11 +157,11 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 
 // -----
 
-// Check that we vectorize reduction with subi.
+// Check that we vectorize reductions with subi.
 //
 // CHECK-ON-LABEL:   func.func @sparse_reduction_subi(
-// CHECK-ON-SAME:                                     %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-ON-SAME:                                     %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
+// CHECK-ON-SAME:      %[[VAL_0:.*]]: tensor<i32>,
+// CHECK-ON-SAME:      %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
 // CHECK-ON-DAG:       %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK-ON-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK-ON-DAG:       %[[VAL_4:.*]] = arith.constant dense<0> : vector<8xi32>
@@ -391,8 +188,8 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 // CHECK-ON:         }
 //
 // CHECK-OFF-LABEL:   func.func @sparse_reduction_subi(
-// CHECK-OFF-SAME:                                     %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-OFF-SAME:                                     %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
+// CHECK-OFF-SAME:      %[[VAL_0:.*]]: tensor<i32>,
+// CHECK-OFF-SAME:      %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
 // CHECK-OFF-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF:           %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
@@ -421,7 +218,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 }
 
 func.func @sparse_reduction_subi(%argx: tensor<i32>,
-                             %arga: tensor<?xi32, #SparseVector>)
+                                 %arga: tensor<?xi32, #SparseVector>)
  -> tensor<i32> {
   %0 = linalg.generic #trait
      ins(%arga: tensor<?xi32, #SparseVector>)
@@ -435,10 +232,8 @@ func.func @sparse_reduction_subi(%argx: tensor<i32>,
 
 // -----
 
-// From this point forward, we essentially have the same test for all
-// arithmetic operation. This is for a code coverage perspective.
+// Check that we vectorize reductions with xor.
 
-// Check that we vectorize xor.
 // CHECK-ON-LABEL: func.func @sparse_reduction_xor(
 // CHECK-ON-SAME: %[[VAL_0:.*]]: tensor<i32>,
 // CHECK-ON-SAME: %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
@@ -512,156 +307,9 @@ func.func @sparse_reduction_xor(%argx: tensor<i32>,
 }
 
 // -----
-// Check that we vectorize and.
-// CHECK-ON-LABEL: func.func @sparse_reduction_and(
-// CHECK-ON-SAME:   %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-ON-SAME:   %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
-// CHECK-ON-DAG:   %[[VAL_2:.*]] = arith.constant 8 : index
-// CHECK-ON-DAG:   %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi32>
-// CHECK-ON-DAG:   %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK-ON-DAG:   %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-ON:   %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
-// CHECK-ON:   %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xi32>
-// CHECK-ON:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
-// CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
-// CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.broadcast %[[VAL_9]] : i32 to vector<8xi32>
-// CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
-// CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
-// CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
-// CHECK-ON:     %[[VAL_18:.*]] = vector.maskedload %[[VAL_7]]{{\[}}%[[VAL_14]]], %[[VAL_17]], %[[VAL_3]] : memref<?xi32>, vector<8xi1>, vector<8xi32> into vector<8xi32>
-// CHECK-ON:     %[[VAL_19:.*]] = arith.andi %[[VAL_15]], %[[VAL_18]] : vector<8xi32>
-// CHECK-ON:     %[[VAL_20:.*]] = arith.select %[[VAL_17]], %[[VAL_19]], %[[VAL_15]] : vector<8xi1>, vector<8xi32>
-// CHECK-ON:     scf.yield %[[VAL_20]] : vector<8xi32>
-// CHECK-ON:   } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:   %[[VAL_21:.*]] = vector.reduction <and>, %[[VAL_22:.*]] : vector<8xi32> into i32
-// CHECK-ON:   memref.store %[[VAL_21]], %[[VAL_8]][] : memref<i32>
-// CHECK-ON:   %[[VAL_23:.*]] = bufferization.to_tensor %[[VAL_8]] : memref<i32>
-// CHECK-ON:   return %[[VAL_23]] : tensor<i32>
-// CHECK-ON: }
-//
-// CHECK-OFF-LABEL: func.func @sparse_reduction_and(
-// CHECK-OFF-SAME:   %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-OFF-SAME:   %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
-// CHECK-OFF-DAG:   %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-OFF:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
-// CHECK-OFF:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xi32>
-// CHECK-OFF:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
-// CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i32>
-// CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK-OFF:   %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) {
-// CHECK-OFF:     %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref<?xi32>
-// CHECK-OFF:     %[[VAL_14:.*]] = arith.andi %[[VAL_12]], %[[VAL_13]] : i32
-// CHECK-OFF:     scf.yield %[[VAL_14]] : i32
-// CHECK-OFF:   } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:   memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref<i32>
-// CHECK-OFF:   %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref<i32>
-// CHECK-OFF:   return %[[VAL_16]] : tensor<i32>
-// CHECK-OFF: }
-
-#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}>
-
-#trait = {
-  indexing_maps = [
-    affine_map<(i) -> (i)>,  // a (in)
-    affine_map<(i) -> ()>    // x (out)
-  ],
-  iterator_types = ["reduction"]
-}
-
-func.func @sparse_reduction_and(%argx: tensor<i32>,
-                             %arga: tensor<?xi32, #SparseVector>)
- -> tensor<i32> {
-  %0 = linalg.generic #trait
-     ins(%arga: tensor<?xi32, #SparseVector>)
-      outs(%argx: tensor<i32>) {
-      ^bb(%a: i32, %x: i32):
-        %t = arith.andi %x, %a: i32
-        linalg.yield %t : i32
-  } -> tensor<i32>
-  return %0 : tensor<i32>
-}
-
-// -----
-// Check that we vectorize muli.
-// CHECK-ON-LABEL: func.func @sparse_reduction_muli(
-// CHECK-ON-SAME:   %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-ON-SAME:   %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
-// CHECK-ON-DAG:   %[[VAL_2:.*]] = arith.constant 8 : index
-// CHECK-ON-DAG:   %[[VAL_3:.*]] = arith.constant dense<1> : vector<8xi32>
-// CHECK-ON-DAG:   %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK-ON-DAG:   %[[VAL_5:.*]] = arith.constant dense<0> : vector<8xi32>
-// CHECK-ON-DAG:   %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK-ON:   %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
-// CHECK-ON:   %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xi32>
-// CHECK-ON:   %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
-// CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_9]][] : memref<i32>
-// CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_13:.*]] = vector.insertelement %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32>
-// CHECK-ON:   %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_2]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (vector<8xi32>) {
-// CHECK-ON:     %[[VAL_17:.*]] = affine.min #map(%[[VAL_12]], %[[VAL_15]]){{\[}}%[[VAL_2]]]
-// CHECK-ON:     %[[VAL_18:.*]] = vector.create_mask %[[VAL_17]] : vector<8xi1>
-// CHECK-ON:     %[[VAL_19:.*]] = vector.maskedload %[[VAL_8]]{{\[}}%[[VAL_15]]], %[[VAL_18]], %[[VAL_5]] : memref<?xi32>, vector<8xi1>, vector<8xi32> into vector<8xi32>
-// CHECK-ON:     %[[VAL_20:.*]] = arith.muli %[[VAL_16]], %[[VAL_19]] : vector<8xi32>
-// CHECK-ON:     %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_20]], %[[VAL_16]] : vector<8xi1>, vector<8xi32>
-// CHECK-ON:     scf.yield %[[VAL_21]] : vector<8xi32>
-// CHECK-ON:   } {"Emitted from" = "linalg.generic"}
-// CHECK-ON:   %[[VAL_22:.*]] = vector.reduction <mul>, %[[VAL_23:.*]] : vector<8xi32> into i32
-// CHECK-ON:   memref.store %[[VAL_22]], %[[VAL_9]][] : memref<i32>
-// CHECK-ON:   %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_9]] : memref<i32>
-// CHECK-ON:   return %[[VAL_24]] : tensor<i32>
-// CHECK-ON: }
-//
-// CHECK-OFF-LABEL: func.func @sparse_reduction_muli(
-// CHECK-OFF-SAME:   %[[VAL_0:.*]]: tensor<i32>,
-// CHECK-OFF-SAME:   %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
-// CHECK-OFF-DAG:   %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-OFF:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>
-// CHECK-OFF:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xi32>
-// CHECK-OFF:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
-// CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i32>
-// CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK-OFF:   %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_7]]) -> (i32) {
-// CHECK-OFF:     %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref<?xi32>
-// CHECK-OFF:     %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_13]] : i32
-// CHECK-OFF:     scf.yield %[[VAL_14]] : i32
-// CHECK-OFF:   } {"Emitted from" = "linalg.generic"}
-// CHECK-OFF:   memref.store %[[VAL_15:.*]], %[[VAL_6]][] : memref<i32>
-// CHECK-OFF:   %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_6]] : memref<i32>
-// CHECK-OFF:   return %[[VAL_16]] : tensor<i32>
-// CHECK-OFF: }
 
-#SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}>
+// Check that we vectorize reductions with addi.
 
-#trait = {
-  indexing_maps = [
-    affine_map<(i) -> (i)>,  // a (in)
-    affine_map<(i) -> ()>    // x (out)
-  ],
-  iterator_types = ["reduction"]
-}
-
-func.func @sparse_reduction_muli(%argx: tensor<i32>,
-                             %arga: tensor<?xi32, #SparseVector>)
- -> tensor<i32> {
-  %0 = linalg.generic #trait
-     ins(%arga: tensor<?xi32, #SparseVector>)
-      outs(%argx: tensor<i32>) {
-      ^bb(%a: i32, %x: i32):
-        %t = arith.muli %x, %a: i32
-        linalg.yield %t : i32
-  } -> tensor<i32>
-  return %0 : tensor<i32>
-}
-
-// -----
-// Check that we vectorize addi.
 // CHECK-ON-LABEL: func.func @sparse_reduction_addi(
 // CHECK-ON-SAME:   %[[VAL_0:.*]]: tensor<i32>,
 // CHECK-ON-SAME:   %[[VAL_1:.*]]: tensor<?xi32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<i32> {
@@ -722,7 +370,7 @@ func.func @sparse_reduction_muli(%argx: tensor<i32>,
 }
 
 func.func @sparse_reduction_addi(%argx: tensor<i32>,
-                             %arga: tensor<?xi32, #SparseVector>)
+                                 %arga: tensor<?xi32, #SparseVector>)
  -> tensor<i32> {
   %0 = linalg.generic #trait
      ins(%arga: tensor<?xi32, #SparseVector>)
@@ -735,7 +383,9 @@ func.func @sparse_reduction_addi(%argx: tensor<i32>,
 }
 
 // -----
-// Check that we vectorize subf.
+
+// Check that we vectorize reductions with subf.
+
 // CHECK-ON-LABEL: func.func @sparse_reduction_subf(
 // CHECK-ON-SAME:   %[[VAL_0:.*]]: tensor<f32>,
 // CHECK-ON-SAME:   %[[VAL_1:.*]]: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<f32> {
@@ -809,10 +459,12 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 }
 
 // -----
-// Check that we vectorize addf.
+
+// Check that we vectorize reductions with addf.
+
 // CHECK-ON-LABEL: func.func @sparse_reduction_addf(
-// CHECK-ON-SAME:   %[[VAL_0:.*]]: tensor<f32>,
-// CHECK-ON-SAME:   %[[VAL_1:.*]]: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<f32> {
+// CHECK-ON-SAME:  %[[VAL_0:.*]]: tensor<f32>,
+// CHECK-ON-SAME:  %[[VAL_1:.*]]: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<f32> {
 // CHECK-ON-DAG:   %[[VAL_2:.*]] = arith.constant 8 : index
 // CHECK-ON-DAG:   %[[VAL_3:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32>
 // CHECK-ON-DAG:   %[[VAL_4:.*]] = arith.constant 0 : index
@@ -839,8 +491,8 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 // CHECK-ON: }
 //
 // CHECK-OFF-LABEL: func.func @sparse_reduction_addf(
-// CHECK-OFF-SAME:    %[[VAL_0:.*]]: tensor<f32>,
-// CHECK-OFF-SAME:    %[[VAL_1:.*]]: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<f32> {
+// CHECK-OFF-SAME:  %[[VAL_0:.*]]: tensor<f32>,
+// CHECK-OFF-SAME:  %[[VAL_1:.*]]: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<f32> {
 // CHECK-OFF-DAG:   %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref<?xindex>

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
index eb4b50b1dd4cc..c7fc8336f025e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir
@@ -28,7 +28,6 @@
 // Reduction in this file _are_ supported by the AArch64 SVE backend
 
 #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>
-#DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense"      ] }>
 
 #trait_reduction = {
   indexing_maps = [
@@ -66,18 +65,6 @@ module {
     return %0 : tensor<f32>
   }
 
-  func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
-                          %argx: tensor<i32>) -> tensor<i32> {
-    %0 = linalg.generic #trait_reduction
-      ins(%arga: tensor<32xi32, #DV>)
-      outs(%argx: tensor<i32>) {
-        ^bb(%a: i32, %x: i32):
-          %0 = arith.andi %x, %a : i32
-          linalg.yield %0 : i32
-    } -> tensor<i32>
-    return %0 : tensor<i32>
-  }
-
   func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
     %0 = linalg.generic #trait_reduction
@@ -130,59 +117,37 @@ module {
       2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
     ]> : tensor<32xf32>
 
-    %c_1_i32 = arith.constant dense<[
-      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
-    ]> : tensor<32xi32>
-
-    %c_1_f32 = arith.constant dense<[
-      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
-      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
-      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
-    ]> : tensor<32xf32>
-
     // Convert constants to annotated tensors.
     %sparse_input_i32 = sparse_tensor.convert %c_0_i32
       : tensor<32xi32> to tensor<32xi32, #SV>
     %sparse_input_f32 = sparse_tensor.convert %c_0_f32
       : tensor<32xf32> to tensor<32xf32, #SV>
-    %dense_input_i32 = sparse_tensor.convert %c_1_i32
-      : tensor<32xi32> to tensor<32xi32, #DV>
-    %dense_input_f32 = sparse_tensor.convert %c_1_f32
-      : tensor<32xf32> to tensor<32xf32, #DV>
 
     // Call the kernels.
     %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
     %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
        : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
-    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
-       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
-    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
+    %2 = call @or_reduction_i32(%sparse_input_i32, %ri)
        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
-    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
+    %3 = call @xor_reduction_i32(%sparse_input_i32, %ri)
        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
 
     // Verify results.
     //
     // CHECK: 26
     // CHECK: 27.5
-    // CHECK: 1
     // CHECK: 15
     // CHECK: 10
     //
     call @dump_i32(%0) : (tensor<i32>) -> ()
     call @dump_f32(%1) : (tensor<f32>) -> ()
-    call @dump_i32(%4) : (tensor<i32>) -> ()
-    call @dump_i32(%5) : (tensor<i32>) -> ()
-    call @dump_i32(%6) : (tensor<i32>) -> ()
+    call @dump_i32(%2) : (tensor<i32>) -> ()
+    call @dump_i32(%3) : (tensor<i32>) -> ()
 
     // Release the resources.
     bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV>
     bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV>
-    bufferization.dealloc_tensor %dense_input_i32  : tensor<32xi32, #DV>
-    bufferization.dealloc_tensor %dense_input_f32  : tensor<32xf32, #DV>
 
     return
   }

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir
index bd232465107d5..c90c2c416cd83 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir
@@ -15,10 +15,6 @@
 // REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
 // RUN: %{command}
 
-// Product reductions - kept in a seperate file as these are not supported by
-// the AArch64 SVE backend (so the set-up is a bit different to
-// sparse_reducitons.mlir)
-
 #SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>
 #DV = #sparse_tensor.encoding<{ lvlTypes = [ "dense"      ] }>
 
@@ -28,32 +24,76 @@
     affine_map<(i) -> ()>    // x (scalar out)
   ],
   iterator_types = ["reduction"],
-  doc = "x += OPER_i a(i)"
+  doc = "x += PROD_CUSTOM_i a(i)"
 }
 
 // An example of vector reductions.
 module {
 
-  func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
-                           %argx: tensor<i32>) -> tensor<i32> {
+  func.func @prod_dreduction_i32(%arga: tensor<32xi32, #DV>,
+                                 %argx: tensor<i32>) -> tensor<i32> {
+    %c = tensor.extract %argx[] : tensor<i32>
     %0 = linalg.generic #trait_reduction
       ins(%arga: tensor<32xi32, #DV>)
       outs(%argx: tensor<i32>) {
-        ^bb(%a: i32, %x: i32):
-          %0 = arith.muli %x, %a : i32
-          linalg.yield %0 : i32
+        ^bb(%a: i32, %b: i32):
+          %1 = sparse_tensor.reduce %a, %b, %c : i32 {
+            ^bb0(%x: i32, %y: i32):
+              %2 = arith.muli %x, %y : i32
+              sparse_tensor.yield %2 : i32
+          }
+          linalg.yield %1 : i32
     } -> tensor<i32>
     return %0 : tensor<i32>
   }
 
-  func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
-                           %argx: tensor<f32>) -> tensor<f32> {
+  func.func @prod_dreduction_f32(%arga: tensor<32xf32, #DV>,
+                                 %argx: tensor<f32>) -> tensor<f32> {
+    %c = tensor.extract %argx[] : tensor<f32>
     %0 = linalg.generic #trait_reduction
       ins(%arga: tensor<32xf32, #DV>)
       outs(%argx: tensor<f32>) {
-        ^bb(%a: f32, %x: f32):
-          %0 = arith.mulf %x, %a : f32
-          linalg.yield %0 : f32
+        ^bb(%a: f32, %b: f32):
+          %1 = sparse_tensor.reduce %a, %b, %c : f32 {
+            ^bb0(%x: f32, %y: f32):
+              %2 = arith.mulf %x, %y : f32
+              sparse_tensor.yield %2 : f32
+          }
+          linalg.yield %1 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+  }
+
+  func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>,
+                                 %argx: tensor<i32>) -> tensor<i32> {
+    %c = tensor.extract %argx[] : tensor<i32>
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xi32, #SV>)
+      outs(%argx: tensor<i32>) {
+        ^bb(%a: i32, %b: i32):
+          %1 = sparse_tensor.reduce %a, %b, %c : i32 {
+            ^bb0(%x: i32, %y: i32):
+              %2 = arith.muli %x, %y : i32
+              sparse_tensor.yield %2 : i32
+          }
+          linalg.yield %1 : i32
+    } -> tensor<i32>
+    return %0 : tensor<i32>
+  }
+
+  func.func @prod_sreduction_f32(%arga: tensor<32xf32, #SV>,
+                                 %argx: tensor<f32>) -> tensor<f32> {
+    %c = tensor.extract %argx[] : tensor<f32>
+    %0 = linalg.generic #trait_reduction
+      ins(%arga: tensor<32xf32, #SV>)
+      outs(%argx: tensor<f32>) {
+        ^bb(%a: f32, %b: f32):
+          %1 = sparse_tensor.reduce %a, %b, %c : f32 {
+            ^bb0(%x: f32, %y: f32):
+              %2 = arith.mulf %x, %y : f32
+              sparse_tensor.yield %2 : f32
+          }
+          linalg.yield %1 : f32
     } -> tensor<f32>
     return %0 : tensor<f32>
   }
@@ -74,6 +114,20 @@ module {
     %ri = arith.constant dense< 7   > : tensor<i32>
     %rf = arith.constant dense< 2.0 > : tensor<f32>
 
+    // Vectors with a few zeros.
+    %c_0_i32 = arith.constant dense<[
+      1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1, 1, 7, 3
+    ]> : tensor<32xi32>
+
+    %c_0_f32 = arith.constant dense<[
+      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
+      1.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+      1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0,
+      1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0
+    ]> : tensor<32xf32>
+
+    // Vectors with no zeros.
     %c_1_i32 = arith.constant dense<[
       1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
@@ -87,28 +141,64 @@ module {
     ]> : tensor<32xf32>
 
     // Convert constants to annotated tensors.
-    %dense_input_i32 = sparse_tensor.convert %c_1_i32
+    %d0_i32 = sparse_tensor.convert %c_0_i32
       : tensor<32xi32> to tensor<32xi32, #DV>
-    %dense_input_f32 = sparse_tensor.convert %c_1_f32
+    %d0_f32 = sparse_tensor.convert %c_0_f32
       : tensor<32xf32> to tensor<32xf32, #DV>
+    %s0_i32 = sparse_tensor.convert %c_0_i32
+      : tensor<32xi32> to tensor<32xi32, #SV>
+    %s0_f32 = sparse_tensor.convert %c_0_f32
+      : tensor<32xf32> to tensor<32xf32, #SV>
+    %d1_i32 = sparse_tensor.convert %c_1_i32
+      : tensor<32xi32> to tensor<32xi32, #DV>
+    %d1_f32 = sparse_tensor.convert %c_1_f32
+      : tensor<32xf32> to tensor<32xf32, #DV>
+    %s1_i32 = sparse_tensor.convert %c_1_i32
+      : tensor<32xi32> to tensor<32xi32, #SV>
+    %s1_f32 = sparse_tensor.convert %c_1_f32
+      : tensor<32xf32> to tensor<32xf32, #SV>
 
     // Call the kernels.
-    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
-       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
-    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
-       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
-
-    // Verify results.
+    %0 = call @prod_dreduction_i32(%d0_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %1 = call @prod_dreduction_f32(%d0_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
+    %2 = call @prod_sreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %3 = call @prod_sreduction_f32(%s0_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
+    %4 = call @prod_dreduction_i32(%d1_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
+    %5 = call @prod_dreduction_f32(%d1_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
+    %6 = call @prod_sreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
+    %7 = call @prod_sreduction_f32(%s1_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
+
+    // Verify results. Note that the custom reduction gave permission
+    // to treat an explicit vs implicit zero differently to compute the
+    // full product reduction. A "standard" product reduction would
+    // have to return 0 for any implicit zero occurrence too.
     //
+    // CHECK: 0
+    // CHECK: 3087
+    // CHECK: 14
+    // CHECK: 3087
+    // CHECK: 168
     // CHECK: 3087
     // CHECK: 168
     //
+    call @dump_i32(%0) : (tensor<i32>) -> ()
+    call @dump_f32(%1) : (tensor<f32>) -> ()
     call @dump_i32(%2) : (tensor<i32>) -> ()
     call @dump_f32(%3) : (tensor<f32>) -> ()
+    call @dump_i32(%4) : (tensor<i32>) -> ()
+    call @dump_f32(%5) : (tensor<f32>) -> ()
+    call @dump_i32(%6) : (tensor<i32>) -> ()
+    call @dump_f32(%7) : (tensor<f32>) -> ()
 
     // Release the resources.
-    bufferization.dealloc_tensor %dense_input_i32  : tensor<32xi32, #DV>
-    bufferization.dealloc_tensor %dense_input_f32  : tensor<32xf32, #DV>
+    bufferization.dealloc_tensor %d0_i32 : tensor<32xi32, #DV>
+    bufferization.dealloc_tensor %d0_f32 : tensor<32xf32, #DV>
+    bufferization.dealloc_tensor %s0_i32 : tensor<32xi32, #SV>
+    bufferization.dealloc_tensor %s0_f32 : tensor<32xf32, #SV>
+    bufferization.dealloc_tensor %d1_i32 : tensor<32xi32, #DV>
+    bufferization.dealloc_tensor %d1_f32 : tensor<32xf32, #DV>
+    bufferization.dealloc_tensor %s1_i32 : tensor<32xi32, #SV>
+    bufferization.dealloc_tensor %s1_f32 : tensor<32xf32, #SV>
 
     return
   }

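A quick worked check of the new expected values, assuming that conversion to
the compressed #SV encoding drops the explicit zeros of the constants above:
with initial value 7 and nonzero integer entries 7, 3, 7, 3, the product is
7 * (7 * 3 * 7 * 3) = 3087; with initial value 2.0 and nonzero float entries
3.5, 2.0, 3.0, 4.0, the product is 2.0 * 84.0 = 168.0. For the vectors that do
contain zeros, the dense (#DV) kernels fold the stored zeros into the product
(hence 0), while the sparse (#SV) kernels reduce only the stored nonzeros
(hence 3087 and 14.0 = 2.0 * 3.5 * 2.0).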
