[Mlir-commits] [mlir] faf7cd9 - [mlir][sparse] merger extension to support sparsifying arith::CmpI/CmpF operation
Peiming Liu
llvmlistbot at llvm.org
Thu Jun 15 10:26:55 PDT 2023
Author: Peiming Liu
Date: 2023-06-15T17:26:50Z
New Revision: faf7cd97d07c6c2242a96f5daf94fb05cda0c115
URL: https://github.com/llvm/llvm-project/commit/faf7cd97d07c6c2242a96f5daf94fb05cda0c115
DIFF: https://github.com/llvm/llvm-project/commit/faf7cd97d07c6c2242a96f5daf94fb05cda0c115.diff
LOG: [mlir][sparse] merger extension to support sparsifying arith::CmpI/CmpF operation
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D152761
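In short: the merger now models arith::CmpI/arith::CmpF as binary tensor expressions
(kCmpI/kCmpF) that carry their predicate attribute, and introduces a synthetic zero
leaf (kSynZero) so a comparison can still be evaluated where one operand stores no
element. A minimal kernel this enables, adapted from the cmp_ds test added below
(#trait2 and #Tds are the trait/encoding declared in sparse_2d.mlir):

  func.func @cmp_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>,
                    %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
    %0 = linalg.generic #trait2
      ins(%arga, %argb: tensor<32x16xf32, #Tds>, tensor<32x16xf32>)
      outs(%argx: tensor<32x16xi1>) {
      ^bb(%a: f32, %b: f32, %x: i1):
        // The predicate (ult) is recorded on the tensor expression and replayed
        // when the sparse loops are emitted.
        %c = arith.cmpf ult, %a, %b : f32
        linalg.yield %c : i1
    } -> tensor<32x16xi1>
    return %0 : tensor<32x16xi1>
  }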
Added:
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
Modified:
mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
mlir/test/Dialect/SparseTensor/sparse_2d.mlir
mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 42dc6d52f3c5a..e166da529c14d 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -45,7 +45,7 @@ struct TensorExp final {
// The `y`, `v`, and `op` parameters either must or must not be
// `kInvalidId`/`nullptr`, depending on the value of the `k` parameter;
// however, they have uniform C++ types regardless of the value of `k`.
- TensorExp(Kind k, unsigned x, ExprId y, Value v, Operation *op);
+ TensorExp(Kind k, unsigned x, ExprId y, Value v, Operation *op, Attribute a);
/// Tensor expression kind.
Kind kind;
@@ -71,6 +71,10 @@ struct TensorExp final {
/// kBinaryBranch, this holds the YieldOp for the left or right half
/// to be merged into a nested scf loop.
Operation *op;
+
+ /// An optional attribute that is required to determine the semantics of the
+ /// operation, e.g., the predicate attribute for CmpI/CmpF operations.
+ Attribute attr;
};
/// Tensor expression kind.
@@ -79,6 +83,10 @@ struct TensorExp final {
/// That is, its argument is a `LoopId` identifying the loop-variable
/// in question, and its value will be the current iteration's value
/// of that loop-variable. See the `LoopId` documentation for more details.
+///
+/// The `kSynZero` leaf kind is for representing a synthetic zero value, which
+/// can be introduced when sparsifying operations like `arith::cmp` to generate
+/// `arith::cmp %lhs, %syn_zero` when the rhs operand is absent.
//
// TODO: Modify this definition so that the numeric values already encode
// the `ExpArity` (while extending the notion of "arity" to include not
@@ -89,6 +97,7 @@ struct TensorExp final {
enum class TensorExp::Kind {
// Leaf.
kTensor = 0,
+ kSynZero,
kInvariant,
kLoopVar,
// Unary operations.
@@ -143,6 +152,8 @@ enum class TensorExp::Kind {
kAndI,
kOrI,
kXorI,
+ kCmpI,
+ kCmpF,
kShrS, // signed
kShrU, // unsigned
kShlI,
@@ -246,13 +257,16 @@ class Merger {
ExprId addLoopVarExp(LoopId i);
/// Constructs a new invariant expression, and returns its identifier.
ExprId addInvariantExp(Value v);
+ /// Constructs a new synthetic zero expression, and returns its identifier.
+ ExprId addSynZeroExp();
/// Constructs a new unary or binary expression, and returns its identifier.
ExprId addExp(TensorExp::Kind k, ExprId e0, ExprId e1 = detail::kInvalidId,
- Operation *op = nullptr);
+ Operation *op = nullptr, Attribute attr = nullptr);
/// Constructs a new sesquinary expression, and returns its identifier.
/// Currently no sesquinary `Kind` allows specifying the `op`, but we
/// allow it anyways because `mapSet` is designed to allow it.
- ExprId addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op = nullptr);
+ ExprId addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op = nullptr,
+ Attribute attr = nullptr);
/// Constructs a new iteration lattice point, and returns its identifier.
LatPointId addLat(TensorId t, LoopId i, ExprId e);
@@ -265,26 +279,29 @@ class Merger {
/// of `LoopId` (effectively constructing a larger "intersection" of those
/// loops) with a newly constructed tensor (sub)expression of given kind.
/// Returns the identifier of the new lattice point.
- LatPointId conjLat(TensorExp::Kind kind, LatPointId p0, LatPointId p1,
+ LatPointId conjLat(ExprId e, LatPointId p0, LatPointId p1,
Operation *op = nullptr);
/// Conjunctive merge of two lattice sets: `(s0 /\_op s1)`.
/// Returns the identifier of the new set.
- LatSetId conjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *op = nullptr);
+ LatSetId conjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op = nullptr);
/// Disjunctive merge of two lattice sets: `(s0 /\_op s1, s0, s1)`.
/// Returns the identifier of the new set.
- LatSetId disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *op = nullptr);
+ LatSetId disjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op = nullptr);
+
+ /// Disjunctive merge of two lattice sets where the missing operand of each
+ /// one-sided point is set to zero:
+ /// `(s0 /\_op s1 (e0 op e1), s0 (e0 op 0), s1 (0 op e1))`.
+ /// Returns the identifier of the new set.
+ LatSetId disjSetWithZero(ExprId e, LatSetId s0, LatSetId s1);
/// Disjunctive merge of two lattice sets with custom handling of the
/// overlap, left, and right regions. Any region may be left missing
/// in the output. Returns the identifier of the new set.
- LatSetId combiSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *orig, bool includeLeft, TensorExp::Kind ltrans,
- Operation *opleft, bool includeRight,
- TensorExp::Kind rtrans, Operation *opright);
+ LatSetId combiSet(ExprId e, LatSetId s0, LatSetId s1, Operation *orig,
+ bool includeLeft, TensorExp::Kind ltrans, Operation *opleft,
+ bool includeRight, TensorExp::Kind rtrans,
+ Operation *opright);
/// Maps the unary operator over the lattice set of the operand, i.e. each
/// lattice point on an expression E is simply copied over, but with OP E
@@ -292,6 +309,12 @@ class Merger {
LatSetId mapSet(TensorExp::Kind kind, LatSetId s, Value v = Value(),
Operation *op = nullptr);
+ /// Maps the binary operator to the same operation but with one of its
+ /// operands set to zero, i.e. each lattice point on an expression E is simply
+ /// copied over, but with `OP 0 E` (if lhsZero == true) or `OP E 0` (if
+ /// lhsZero == false) as the new expression. Returns the identifier of the new set.
+ LatSetId mapBinWithSynZeroSet(ExprId e, LatSetId s, bool lhsZero);
+
/// Optimizes the iteration lattice points in the given set. This
/// method should be called right before code generation to avoid
/// generating redundant loops and conditions.
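A sketch of what the new disjSetWithZero produces for a comparison x < y with both
operands sparse (the cmp_ss_ss test below exercises exactly this): the overlap region
compares the two stored values, while each one-sided region compares the stored value
against a synthetic zero of the same element type. In scalar form (names illustrative):

  %zero = arith.constant 0.000000e+00 : f32
  // Both operands present at coordinate (i, j).
  %both = arith.cmpf ult, %a, %b : f32
  // Only the lhs is present: the rhs is replaced by the synthetic zero.
  %lhs_only = arith.cmpf ult, %a, %zero : f32
  // Only the rhs is present: the lhs is replaced by the synthetic zero.
  %rhs_only = arith.cmpf ult, %zero, %b : f32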
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 7e69a737b0661..428bc49d14ac9 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1154,11 +1154,11 @@ static Value relinkBranch(CodegenEnv &env, RewriterBase &rewriter, Block *block,
/// Recursively generates tensor expression.
static Value genExp(CodegenEnv &env, RewriterBase &rewriter, ExprId e,
LoopId ldx) {
- linalg::GenericOp op = env.op();
- Location loc = op.getLoc();
-
if (e == ::mlir::sparse_tensor::detail::kInvalidId)
return Value();
+
+ linalg::GenericOp op = env.op();
+ Location loc = op.getLoc();
const TensorExp &exp = env.exp(e);
const auto kind = exp.kind;
if (kind == TensorExp::Kind::kTensor)
@@ -1171,8 +1171,22 @@ static Value genExp(CodegenEnv &env, RewriterBase &rewriter, ExprId e,
if (kind == TensorExp::Kind::kReduce)
env.startCustomReduc(e); // enter custom
- Value v0 = genExp(env, rewriter, exp.children.e0, ldx);
- Value v1 = genExp(env, rewriter, exp.children.e1, ldx);
+ Value v0, v1;
+ // If either the lhs or the rhs is a synthetic zero, we infer the type of the
+ // zero value from the other operand.
+ if (exp.children.e0 != ::mlir::sparse_tensor::detail::kInvalidId &&
+ env.exp(exp.children.e0).kind == TensorExp::Kind::kSynZero) {
+ v1 = genExp(env, rewriter, exp.children.e1, ldx);
+ v0 = constantZero(rewriter, loc, v1.getType());
+ } else if (exp.children.e1 != ::mlir::sparse_tensor::detail::kInvalidId &&
+ env.exp(exp.children.e1).kind == TensorExp::Kind::kSynZero) {
+ v0 = genExp(env, rewriter, exp.children.e0, ldx);
+ v1 = constantZero(rewriter, loc, v0.getType());
+ } else {
+ v0 = genExp(env, rewriter, exp.children.e0, ldx);
+ v1 = genExp(env, rewriter, exp.children.e1, ldx);
+ }
+
Value ee;
if (kind == TensorExp::Kind::kReduce && (!v0 || !v1)) {
// custom reduce did not receive a value
@@ -1248,7 +1262,8 @@ static void genInvariants(CodegenEnv &env, OpBuilder &builder, ExprId exp,
env.merger().clearExprValue(exp);
}
} else if (env.exp(exp).kind != TensorExp::Kind::kInvariant &&
- env.exp(exp).kind != TensorExp::Kind::kLoopVar) {
+ env.exp(exp).kind != TensorExp::Kind::kLoopVar &&
+ env.exp(exp).kind != TensorExp::Kind::kSynZero) {
// Traverse into the binary operations. Note that we only hoist
// tensor loads, since subsequent MLIR/LLVM passes know how to
// deal with all other kinds of derived loop invariants.
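Note on the genExp change above: the synthetic zero leaf has no type of its own, so
the non-zero child is generated first and constantZero then materializes a matching
constant. For an f32 operand this yields (illustrative values, mirroring the CHECK
lines in the updated sparse_2d.mlir tests):

  %b    = memref.load %argb[%i, %j] : memref<32x16xf32>
  %zero = arith.constant 0.000000e+00 : f32   // type taken from %b
  %c    = arith.cmpf ult, %zero, %b : f32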
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 6ec5d42a78c36..7a39aa48d8706 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -31,6 +31,7 @@ static ExpArity getExpArity(TensorExp::Kind k) {
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
+ case TensorExp::Kind::kSynZero:
return ExpArity::kNullary;
case TensorExp::Kind::kAbsF:
case TensorExp::Kind::kAbsC:
@@ -89,6 +90,8 @@ static ExpArity getExpArity(TensorExp::Kind k) {
case TensorExp::Kind::kSubF:
case TensorExp::Kind::kSubC:
case TensorExp::Kind::kSubI:
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
return ExpArity::kBinary;
}
llvm_unreachable("unexpected kind");
@@ -99,7 +102,7 @@ static ExpArity getExpArity(TensorExp::Kind k) {
//===----------------------------------------------------------------------===//
TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
- Operation *o)
+ Operation *o, Attribute a)
: kind(k), val(v), op(o) {
switch (kind) {
// Leaf.
@@ -107,6 +110,9 @@ TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
assert(x != detail::kInvalidId && y == detail::kInvalidId && !v && !o);
tensor = x;
return;
+ case TensorExp::Kind::kSynZero:
+ assert(x == detail::kInvalidId && y == detail::kInvalidId && !v && !o);
+ return;
case TensorExp::Kind::kInvariant:
assert(x == detail::kInvalidId && y == detail::kInvalidId && v && !o);
return;
@@ -191,6 +197,13 @@ TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
children.e0 = x;
children.e1 = y;
return;
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
+ assert(x != detail::kInvalidId && y != detail::kInvalidId && !v && !o);
+ attr = a;
+ children.e0 = x;
+ children.e1 = y;
+ return;
case TensorExp::Kind::kBinary:
case TensorExp::Kind::kReduce:
assert(x != detail::kInvalidId && y != detail::kInvalidId && !v && o);
@@ -228,7 +241,7 @@ ExprId Merger::addTensorExp(TensorId t) {
assert(isValidTensorId(t));
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kTensor, t, detail::kInvalidId,
- Value(), nullptr);
+ Value(), nullptr, nullptr);
return eNew;
}
@@ -236,28 +249,37 @@ ExprId Merger::addLoopVarExp(LoopId i) {
assert(isValidLoopId(i));
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kLoopVar, i, detail::kInvalidId,
- Value(), nullptr);
+ Value(), nullptr, nullptr);
return eNew;
}
ExprId Merger::addInvariantExp(Value v) {
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kInvariant, detail::kInvalidId,
- detail::kInvalidId, v, nullptr);
+ detail::kInvalidId, v, nullptr, nullptr);
return eNew;
}
-ExprId Merger::addExp(TensorExp::Kind k, ExprId e0, ExprId e1, Operation *op) {
+ExprId Merger::addSynZeroExp() {
+ const ExprId eNew(tensorExps.size());
+ tensorExps.emplace_back(TensorExp::Kind::kSynZero, detail::kInvalidId,
+ detail::kInvalidId, Value(), nullptr, nullptr);
+ return eNew;
+}
+
+ExprId Merger::addExp(TensorExp::Kind k, ExprId e0, ExprId e1, Operation *op,
+ Attribute attr) {
assert(k > TensorExp::Kind::kLoopVar);
const ExprId eNew(tensorExps.size());
- tensorExps.emplace_back(k, e0, e1, Value(), op);
+ tensorExps.emplace_back(k, e0, e1, Value(), op, attr);
return eNew;
}
-ExprId Merger::addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op) {
+ExprId Merger::addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op,
+ Attribute attr) {
assert(k > TensorExp::Kind::kLoopVar);
const ExprId eNew(tensorExps.size());
- tensorExps.emplace_back(k, e, detail::kInvalidId, v, op);
+ tensorExps.emplace_back(k, e, detail::kInvalidId, v, op, attr);
return eNew;
}
@@ -283,31 +305,33 @@ LatSetId Merger::addSet() {
return sNew;
}
-LatPointId Merger::conjLat(TensorExp::Kind kind, LatPointId p0, LatPointId p1,
+LatPointId Merger::conjLat(ExprId e, LatPointId p0, LatPointId p1,
Operation *op) {
+ TensorExp::Kind kind = exp(e).kind;
+ Attribute attr = exp(e).attr;
const LatPointId pNew(latPoints.size());
const auto &point0 = lat(p0);
const auto &point1 = lat(p1);
BitVector bits(point0.bits);
bits |= point1.bits;
- const ExprId e = addExp(kind, point0.exp, point1.exp, op);
- latPoints.emplace_back(bits, e);
+ const ExprId ne = addExp(kind, point0.exp, point1.exp, op, attr);
+ latPoints.emplace_back(bits, ne);
return pNew;
}
-LatSetId Merger::conjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *op) {
+LatSetId Merger::conjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op) {
const LatSetId sNew = addSet();
auto &setNew = latSets[sNew];
for (const LatPointId p0 : set(s0))
for (const LatPointId p1 : set(s1))
- setNew.push_back(conjLat(kind, p0, p1, op));
+ setNew.push_back(conjLat(e, p0, p1, op));
return sNew;
}
-LatSetId Merger::disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *op) {
- const LatSetId sNew = conjSet(kind, s0, s1, op);
+LatSetId Merger::disjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op) {
+ const LatSetId sNew = conjSet(e, s0, s1, op);
+ TensorExp::Kind kind = exp(e).kind;
+
// Followed by all in s0.
latSets[sNew].append(latSets[s0]);
// Map binary 0-y to unary -y.
@@ -323,12 +347,35 @@ LatSetId Merger::disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
return sNew;
}
-LatSetId Merger::combiSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
- Operation *orig, bool includeLeft,
- TensorExp::Kind ltrans, Operation *opleft,
- bool includeRight, TensorExp::Kind rtrans,
- Operation *opright) {
- const LatSetId sNew = conjSet(kind, s0, s1, orig);
+LatSetId Merger::disjSetWithZero(ExprId e, LatSetId s0, LatSetId s1) {
+ assert(exp(e).kind == TensorExp::Kind::kCmpI ||
+ exp(e).kind == TensorExp::Kind::kCmpF);
+ const LatSetId sNew = conjSet(e, s0, s1, nullptr);
+
+ ExprId e0 = exp(e).children.e0;
+ ExprId e1 = exp(e).children.e1;
+ if (exp(e0).kind == TensorExp::Kind::kSynZero ||
+ exp(e1).kind == TensorExp::Kind::kSynZero) {
+ // lhs and rhs can't be synthetic zero at the same time.
+ assert(exp(e0).kind != exp(e1).kind);
+ // If one of the operands has already been set to the synthetic zero (the
+ // element is absent in the corresponding operand), then we do not
+ // need to build a disjunctive set for it.
+ return sNew;
+ }
+
+ auto lhsSet = mapBinWithSynZeroSet(e, s0, false);
+ auto rhsSet = mapBinWithSynZeroSet(e, s1, true);
+ latSets[sNew].append(latSets[lhsSet]);
+ latSets[sNew].append(latSets[rhsSet]);
+ return sNew;
+}
+
+LatSetId Merger::combiSet(ExprId e, LatSetId s0, LatSetId s1, Operation *orig,
+ bool includeLeft, TensorExp::Kind ltrans,
+ Operation *opleft, bool includeRight,
+ TensorExp::Kind rtrans, Operation *opright) {
+ const LatSetId sNew = conjSet(e, s0, s1, orig);
// Left Region.
if (includeLeft) {
if (opleft)
@@ -356,6 +403,23 @@ LatSetId Merger::mapSet(TensorExp::Kind kind, LatSetId s0, Value v,
return sNew;
}
+LatSetId Merger::mapBinWithSynZeroSet(ExprId e, LatSetId s0, bool lhsZero) {
+ TensorExp::Kind kind = exp(e).kind;
+ Attribute a = exp(e).attr;
+ // Must be a binary operation.
+ assert(TensorExp::Kind::kMulF <= kind && kind <= TensorExp::Kind::kShlI);
+ const LatSetId sNew = addSet();
+ auto &setNew = latSets[sNew];
+ const ExprId zeroExp = addSynZeroExp();
+ for (const LatPointId p : set(s0)) {
+ const auto &point = latPoints[p];
+ ExprId newExp = lhsZero ? addExp(kind, zeroExp, point.exp, nullptr, a)
+ : addExp(kind, point.exp, zeroExp, nullptr, a);
+ setNew.push_back(addLat(point.bits, newExp));
+ }
+ return sNew;
+}
+
LatSetId Merger::optimizeSet(LatSetId s0) {
const LatSetId sNew = addSet();
auto &setNew = latSets[sNew];
@@ -418,7 +482,8 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
// Slice on dense level has `locate` property as well, and can be optimized.
if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
const auto dlt = getLvlType(b);
- if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt) && !isCompressedWithHiDLT(dlt)) {
+ if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt) &&
+ !isCompressedWithHiDLT(dlt)) {
if (reset)
simple.reset(b);
reset = true;
@@ -505,6 +570,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
return expr.tensor == t;
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
+ case TensorExp::Kind::kSynZero:
return false;
// Unary operations.
case TensorExp::Kind::kAbsF:
@@ -576,6 +642,8 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
case TensorExp::Kind::kSubI:
case TensorExp::Kind::kOrI:
case TensorExp::Kind::kXorI:
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kBinary:
return false;
}
@@ -585,7 +653,8 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
bool Merger::hasAnySparse(const BitVector &bits) const {
for (TensorLoopId b : bits.set_bits()) {
const auto dlt = getLvlType(b);
- if (isCompressedDLT(dlt) || isSingletonDLT(dlt) || isCompressedWithHiDLT(dlt))
+ if (isCompressedDLT(dlt) || isSingletonDLT(dlt) ||
+ isCompressedWithHiDLT(dlt))
return true;
}
return hasSparseIdxReduction(bits);
@@ -613,6 +682,8 @@ static const char *kindToOpSymbol(TensorExp::Kind kind) {
return "invariant";
case TensorExp::Kind::kLoopVar:
return "index";
+ case TensorExp::Kind::kSynZero:
+ return "0";
// Unary operations.
case TensorExp::Kind::kAbsF:
case TensorExp::Kind::kAbsC:
@@ -693,6 +764,9 @@ static const char *kindToOpSymbol(TensorExp::Kind kind) {
return ">>";
case TensorExp::Kind::kShlI:
return "<<";
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
+ return "cmp";
case TensorExp::Kind::kBinary:
return "binary";
case TensorExp::Kind::kReduce:
@@ -715,6 +789,9 @@ void Merger::dumpExp(ExprId e) const {
case TensorExp::Kind::kInvariant:
llvm::dbgs() << "invariant";
break;
+ case TensorExp::Kind::kSynZero:
+ llvm::dbgs() << "0";
+ break;
case TensorExp::Kind::kLoopVar:
llvm::dbgs() << "loopvar_" << expr.loop;
break;
@@ -776,11 +853,16 @@ void Merger::dumpExp(ExprId e) const {
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
case TensorExp::Kind::kShlI:
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kBinary:
case TensorExp::Kind::kReduce:
llvm::dbgs() << "(";
dumpExp(expr.children.e0);
- llvm::dbgs() << " " << kindToOpSymbol(expr.kind) << " ";
+ llvm::dbgs() << " " << kindToOpSymbol(expr.kind);
+ if (expr.attr)
+ llvm::dbgs() << "{" << expr.attr << "}";
+ llvm::dbgs() << " ";
dumpExp(expr.children.e1);
llvm::dbgs() << ")";
break;
@@ -839,6 +921,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
// Leaf.
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
+ case TensorExp::Kind::kSynZero:
case TensorExp::Kind::kLoopVar: {
// Either the loop-var is really used in the tensor expression, or it is
// set to the undefined loop-var in that level. An invariant expression,
@@ -928,7 +1011,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
YieldOp absentYield = cast<YieldOp>(absentBlock.getTerminator());
const Value absentVal = absentYield.getResult();
const ExprId rhs = addInvariantExp(absentVal);
- return disjSet(kind, child0, buildLattices(rhs, i), unop);
+ return disjSet(e, child0, buildLattices(rhs, i), unop);
}
// Binary operations.
case TensorExp::Kind::kMulF:
@@ -947,7 +1030,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
{
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
- return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
+ return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kDivF:
case TensorExp::Kind::kDivC:
@@ -970,7 +1053,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
assert(!maybeZero(e1));
- return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
+ return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kAddF:
case TensorExp::Kind::kAddC:
@@ -990,7 +1073,21 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
{
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
- return disjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
+ return disjSet(e, buildLattices(e0, i), buildLattices(e1, i));
+ }
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
+ // A comparison operation needs to be performed
+ // for the disjunction of sparse iteration spaces.
+ //
+ //  x < y |  !y   |   y   |
+ // -------+-------+-------+
+ //   !x   |   0   | 0 < y |
+ //    x   | x < 0 | x < y |
+ {
+ const ExprId e0 = expr.children.e0;
+ const ExprId e1 = expr.children.e1;
+ return disjSetWithZero(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
@@ -1002,7 +1099,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
assert(isInvariant(e1));
- return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
+ return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kBinary:
// A custom binary operation.
@@ -1033,9 +1130,9 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
}
bool includeLeft = binop.getLeftIdentity() || !leftRegion.empty();
bool includeRight = binop.getRightIdentity() || !rightRegion.empty();
- return combiSet(TensorExp::Kind::kBinary, child0, child1, binop,
- includeLeft, TensorExp::Kind::kBinaryBranch, leftYield,
- includeRight, TensorExp::Kind::kBinaryBranch, rightYield);
+ return combiSet(e, child0, child1, binop, includeLeft,
+ TensorExp::Kind::kBinaryBranch, leftYield, includeRight,
+ TensorExp::Kind::kBinaryBranch, rightYield);
}
case TensorExp::Kind::kReduce:
// A custom reduce operation.
@@ -1043,7 +1140,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
Operation *const op = expr.op;
- return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i), op);
+ return conjSet(e, buildLattices(e0, i), buildLattices(e1, i), op);
}
}
llvm_unreachable("unexpected expression kind");
@@ -1261,6 +1358,37 @@ std::optional<ExprId> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
return addExp(TensorExp::Kind::kShrU, e0, e1);
if (isa<arith::ShLIOp>(def) && isInvariant(e1))
return addExp(TensorExp::Kind::kShlI, e0, e1);
+ if (auto ci = dyn_cast<arith::CmpIOp>(def)) {
+ if (ci.getPredicate() == arith::CmpIPredicate::eq ||
+ ci.getPredicate() == arith::CmpIPredicate::sle ||
+ ci.getPredicate() == arith::CmpIPredicate::sge ||
+ ci.getPredicate() == arith::CmpIPredicate::ule ||
+ ci.getPredicate() == arith::CmpIPredicate::uge) {
+ // We cannot sparsify comparisons that include equality, because
+ // 0 <= 0 yields true and thus densifies the result.
+ return std::nullopt;
+ }
+
+ return addExp(TensorExp::Kind::kCmpI, e0, e1, nullptr,
+ ci.getPredicateAttr());
+ }
+ if (auto cf = dyn_cast<arith::CmpFOp>(def)) {
+ if (cf.getPredicate() == arith::CmpFPredicate::OEQ ||
+ cf.getPredicate() == arith::CmpFPredicate::OGE ||
+ cf.getPredicate() == arith::CmpFPredicate::OLE ||
+ cf.getPredicate() == arith::CmpFPredicate::ONE ||
+ cf.getPredicate() == arith::CmpFPredicate::UEQ ||
+ cf.getPredicate() == arith::CmpFPredicate::UGE ||
+ cf.getPredicate() == arith::CmpFPredicate::ULE ||
+ cf.getPredicate() == arith::CmpFPredicate::ORD ||
+ cf.getPredicate() == arith::CmpFPredicate::UNO) {
+ // We cannot sparsify comparisons that include equality, because
+ // 0 <= 0 yields true and thus densifies the result.
+ return std::nullopt;
+ }
+ return addExp(TensorExp::Kind::kCmpF, e0, e1, nullptr,
+ cf.getPredicateAttr());
+ }
if (auto binop = dyn_cast<sparse_tensor::BinaryOp>(def)) {
if (isAdmissibleBranch(binop, binop.getOverlapRegion()) &&
(binop.getLeftIdentity() ||
@@ -1342,6 +1470,7 @@ Value Merger::buildExp(RewriterBase &rewriter, Location loc, ExprId e, Value v0,
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
+ case TensorExp::Kind::kSynZero:
llvm_unreachable("unexpected non-op");
// Unary operations.
case TensorExp::Kind::kAbsF:
@@ -1458,6 +1587,14 @@ Value Merger::buildExp(RewriterBase &rewriter, Location loc, ExprId e, Value v0,
return rewriter.create<arith::ShRUIOp>(loc, v0, v1);
case TensorExp::Kind::kShlI:
return rewriter.create<arith::ShLIOp>(loc, v0, v1);
+ case TensorExp::Kind::kCmpI: {
+ auto predicate = llvm::cast<arith::CmpIPredicateAttr>(expr.attr);
+ return rewriter.create<arith::CmpIOp>(loc, predicate, v0, v1);
+ }
+ case TensorExp::Kind::kCmpF: {
+ auto predicate = llvm::cast<arith::CmpFPredicateAttr>(expr.attr);
+ return rewriter.create<arith::CmpFOp>(loc, predicate, v0, v1);
+ }
case TensorExp::Kind::kBinaryBranch: // semi-ring ops with custom logic.
return insertYieldOp(rewriter, loc, *expr.op->getBlock()->getParent(),
{v0});
diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
index 664121aae5a37..14187063e13c2 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
@@ -52,6 +52,43 @@ func.func @add_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg
return %0 : tensor<32x16xf32>
}
+// CHECK-LABEL: func.func @cmp_dd(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
+// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_10]] : memref<32x16xi1>)
+// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index
+// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
+// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xf32>
+// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_11]], %[[VAL_12]]] : memref<32x16xf32>
+// CHECK: %[[VAL_17:.*]] = arith.cmpf ult, %[[VAL_15]], %[[VAL_16]] : f32
+// CHECK: memref.store %[[VAL_17]], %[[VAL_10]]{{\[}}%[[VAL_11]], %[[VAL_12]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_18:.*]] = bufferization.to_tensor %[[VAL_10]] : memref<32x16xi1>
+// CHECK: return %[[VAL_18]] : tensor<32x16xi1>
+// CHECK: }
+func.func @cmp_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
+ %0 = linalg.generic #trait2
+ ins(%arga, %argb: tensor<32x16xf32, #Tdd>, tensor<32x16xf32>)
+ outs(%argx: tensor<32x16xi1>) {
+ ^bb(%a: f32, %b: f32, %x: i1):
+ %0 = arith.cmpf ult, %a, %b : f32
+ linalg.yield %0 : i1
+ } -> tensor<32x16xi1>
+ return %0 : tensor<32x16xi1>
+}
+
// CHECK-LABEL: func @mul_dd(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
@@ -151,6 +188,73 @@ func.func @add_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg
return %0 : tensor<32x16xf32>
}
+// CHECK-LABEL: func.func @cmp_ds(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
+// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
+// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_7]] : index
+// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_17]]] : memref<?xindex>
+// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_18]] : index
+// CHECK: scf.condition(%[[VAL_22]]) %[[VAL_20]], %[[VAL_21]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index):
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
+// CHECK: scf.if %[[VAL_26]] {
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xf32>
+// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xf32>
+// CHECK: %[[VAL_29:.*]] = arith.cmpf ult, %[[VAL_27]], %[[VAL_28]] : f32
+// CHECK: memref.store %[[VAL_29]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_8]] {
+// CHECK: %[[VAL_30:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xf32>
+// CHECK: %[[VAL_31:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_30]] : f32
+// CHECK: memref.store %[[VAL_31]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_32:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
+// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
+// CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_32]], %[[VAL_33]], %[[VAL_23]] : index
+// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_24]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_34]], %[[VAL_35]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_37:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_36]]] : memref<32x16xf32>
+// CHECK: %[[VAL_39:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_38]] : f32
+// CHECK: memref.store %[[VAL_39]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_36]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_40:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<32x16xi1>
+// CHECK: return %[[VAL_40]] : tensor<32x16xi1>
+// CHECK: }
+func.func @cmp_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
+ %0 = linalg.generic #trait2
+ ins(%arga, %argb: tensor<32x16xf32, #Tds>, tensor<32x16xf32>)
+ outs(%argx: tensor<32x16xi1>) {
+ ^bb(%a: f32, %b: f32, %x: i1):
+ %0 = arith.cmpf ult, %a, %b : f32
+ linalg.yield %0 : i1
+ } -> tensor<32x16xi1>
+ return %0 : tensor<32x16xi1>
+}
+
// CHECK-LABEL: func @mul_ds(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
@@ -258,6 +362,78 @@ func.func @add_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg
return %0 : tensor<32x16xf32>
}
+// CHECK-LABEL: func.func @cmp_sd(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
+// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xf32>
+// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
+// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
+// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_15]], %[[VAL_19:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_16]] : index
+// CHECK: scf.condition(%[[VAL_20]]) %[[VAL_18]], %[[VAL_19]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_21:.*]]: index, %[[VAL_22:.*]]: index):
+// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]]] : memref<?xindex>
+// CHECK: %[[VAL_24:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_22]] : index
+// CHECK: scf.if %[[VAL_24]] {
+// CHECK: scf.for %[[VAL_25:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: %[[VAL_26:.*]] = arith.muli %[[VAL_21]], %[[VAL_3]] : index
+// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_25]] : index
+// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_27]]] : memref<?xf32>
+// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_25]]] : memref<32x16xf32>
+// CHECK: %[[VAL_30:.*]] = arith.cmpf ult, %[[VAL_28]], %[[VAL_29]] : f32
+// CHECK: memref.store %[[VAL_30]], %[[VAL_14]]{{\[}}%[[VAL_22]], %[[VAL_25]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_8]] {
+// CHECK: scf.for %[[VAL_31:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<32x16xf32>
+// CHECK: %[[VAL_33:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_32]] : f32
+// CHECK: memref.store %[[VAL_33]], %[[VAL_14]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_34:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_22]] : index
+// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_21]], %[[VAL_7]] : index
+// CHECK: %[[VAL_36:.*]] = arith.select %[[VAL_34]], %[[VAL_35]], %[[VAL_21]] : index
+// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_22]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_36]], %[[VAL_37]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_39:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: scf.for %[[VAL_40:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: %[[VAL_41:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_38]], %[[VAL_40]]] : memref<32x16xf32>
+// CHECK: %[[VAL_42:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_41]] : f32
+// CHECK: memref.store %[[VAL_42]], %[[VAL_14]]{{\[}}%[[VAL_38]], %[[VAL_40]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_43:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<32x16xi1>
+// CHECK: return %[[VAL_43]] : tensor<32x16xi1>
+// CHECK: }
+func.func @cmp_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
+ %0 = linalg.generic #trait2
+ ins(%arga, %argb: tensor<32x16xf32, #Tsd>, tensor<32x16xf32>)
+ outs(%argx: tensor<32x16xi1>) {
+ ^bb(%a: f32, %b: f32, %x: i1):
+ %0 = arith.cmpf ult, %a, %b : f32
+ linalg.yield %0 : i1
+ } -> tensor<32x16xi1>
+ return %0 : tensor<32x16xi1>
+}
+
// CHECK-LABEL: func @mul_sd(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
@@ -392,6 +568,106 @@ func.func @add_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg
return %0 : tensor<32x16xf32>
}
+// CHECK-LABEL: func.func @cmp_ss(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
+// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
+// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
+// CHECK: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_16]] : memref<32x16xi1>)
+// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_17]], %[[VAL_21:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_18]] : index
+// CHECK: scf.condition(%[[VAL_22]]) %[[VAL_20]], %[[VAL_21]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index):
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
+// CHECK: scf.if %[[VAL_26]] {
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
+// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_30:.*]]:2 = scf.while (%[[VAL_31:.*]] = %[[VAL_27]], %[[VAL_32:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_33:.*]] = arith.cmpi ult, %[[VAL_31]], %[[VAL_29]] : index
+// CHECK: scf.condition(%[[VAL_33]]) %[[VAL_31]], %[[VAL_32]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_34:.*]]: index, %[[VAL_35:.*]]: index):
+// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_34]]] : memref<?xindex>
+// CHECK: %[[VAL_37:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_35]] : index
+// CHECK: scf.if %[[VAL_37]] {
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_34]]] : memref<?xf32>
+// CHECK: %[[VAL_39:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xf32>
+// CHECK: %[[VAL_40:.*]] = arith.cmpf ult, %[[VAL_38]], %[[VAL_39]] : f32
+// CHECK: memref.store %[[VAL_40]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_8]] {
+// CHECK: %[[VAL_41:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xf32>
+// CHECK: %[[VAL_42:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_41]] : f32
+// CHECK: memref.store %[[VAL_42]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_43:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_35]] : index
+// CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_34]], %[[VAL_7]] : index
+// CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_43]], %[[VAL_44]], %[[VAL_34]] : index
+// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_35]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_45]], %[[VAL_46]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_47:.*]] = %[[VAL_48:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: %[[VAL_49:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_47]]] : memref<32x16xf32>
+// CHECK: %[[VAL_50:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_49]] : f32
+// CHECK: memref.store %[[VAL_50]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_47]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_8]] {
+// CHECK: scf.for %[[VAL_51:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: %[[VAL_52:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_51]]] : memref<32x16xf32>
+// CHECK: %[[VAL_53:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_52]] : f32
+// CHECK: memref.store %[[VAL_53]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_51]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_54:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
+// CHECK: %[[VAL_55:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
+// CHECK: %[[VAL_56:.*]] = arith.select %[[VAL_54]], %[[VAL_55]], %[[VAL_23]] : index
+// CHECK: %[[VAL_57:.*]] = arith.addi %[[VAL_24]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_56]], %[[VAL_57]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_58:.*]] = %[[VAL_59:.*]]#1 to %[[VAL_3]] step %[[VAL_7]] {
+// CHECK: scf.for %[[VAL_60:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
+// CHECK: %[[VAL_61:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_58]], %[[VAL_60]]] : memref<32x16xf32>
+// CHECK: %[[VAL_62:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_61]] : f32
+// CHECK: memref.store %[[VAL_62]], %[[VAL_16]]{{\[}}%[[VAL_58]], %[[VAL_60]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_63:.*]] = bufferization.to_tensor %[[VAL_16]] : memref<32x16xi1>
+// CHECK: return %[[VAL_63]] : tensor<32x16xi1>
+// CHECK: }
+func.func @cmp_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
+ %0 = linalg.generic #trait2
+ ins(%arga, %argb: tensor<32x16xf32, #Tss>, tensor<32x16xf32>)
+ outs(%argx: tensor<32x16xi1>) {
+ ^bb(%a: f32, %b: f32, %x: i1):
+ %0 = arith.cmpf ult, %a, %b : f32
+ linalg.yield %0 : i1
+ } -> tensor<32x16xi1>
+ return %0 : tensor<32x16xi1>
+}
+
// CHECK-LABEL: func @mul_ss(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
@@ -599,6 +875,180 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
return %0 : tensor<32x16xf32>
}
+// CHECK-LABEL: func.func @cmp_ss_ss(
+// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK: linalg.fill ins(%[[VAL_3]] : i1) outs(%[[VAL_17]] : memref<32x16xi1>)
+// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_22:.*]]:2 = scf.while (%[[VAL_23:.*]] = %[[VAL_18]], %[[VAL_24:.*]] = %[[VAL_20]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_19]] : index
+// CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_21]] : index
+// CHECK: %[[VAL_27:.*]] = arith.andi %[[VAL_25]], %[[VAL_26]] : i1
+// CHECK: scf.condition(%[[VAL_27]]) %[[VAL_23]], %[[VAL_24]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_28:.*]]: index, %[[VAL_29:.*]]: index):
+// CHECK: %[[VAL_30:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_32:.*]] = arith.cmpi ult, %[[VAL_31]], %[[VAL_30]] : index
+// CHECK: %[[VAL_33:.*]] = arith.select %[[VAL_32]], %[[VAL_31]], %[[VAL_30]] : index
+// CHECK: %[[VAL_34:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
+// CHECK: %[[VAL_35:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
+// CHECK: %[[VAL_36:.*]] = arith.andi %[[VAL_34]], %[[VAL_35]] : i1
+// CHECK: scf.if %[[VAL_36]] {
+// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
+// CHECK: %[[VAL_39:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_38]]] : memref<?xindex>
+// CHECK: %[[VAL_40:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
+// CHECK: %[[VAL_42:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_41]]] : memref<?xindex>
+// CHECK: %[[VAL_43:.*]]:2 = scf.while (%[[VAL_44:.*]] = %[[VAL_37]], %[[VAL_45:.*]] = %[[VAL_40]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_46:.*]] = arith.cmpi ult, %[[VAL_44]], %[[VAL_39]] : index
+// CHECK: %[[VAL_47:.*]] = arith.cmpi ult, %[[VAL_45]], %[[VAL_42]] : index
+// CHECK: %[[VAL_48:.*]] = arith.andi %[[VAL_46]], %[[VAL_47]] : i1
+// CHECK: scf.condition(%[[VAL_48]]) %[[VAL_44]], %[[VAL_45]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_49:.*]]: index, %[[VAL_50:.*]]: index):
+// CHECK: %[[VAL_51:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_49]]] : memref<?xindex>
+// CHECK: %[[VAL_52:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_50]]] : memref<?xindex>
+// CHECK: %[[VAL_53:.*]] = arith.cmpi ult, %[[VAL_52]], %[[VAL_51]] : index
+// CHECK: %[[VAL_54:.*]] = arith.select %[[VAL_53]], %[[VAL_52]], %[[VAL_51]] : index
+// CHECK: %[[VAL_55:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
+// CHECK: %[[VAL_56:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
+// CHECK: %[[VAL_57:.*]] = arith.andi %[[VAL_55]], %[[VAL_56]] : i1
+// CHECK: scf.if %[[VAL_57]] {
+// CHECK: %[[VAL_58:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_49]]] : memref<?xf32>
+// CHECK: %[[VAL_59:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_50]]] : memref<?xf32>
+// CHECK: %[[VAL_60:.*]] = arith.cmpf ult, %[[VAL_58]], %[[VAL_59]] : f32
+// CHECK: memref.store %[[VAL_60]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: %[[VAL_61:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
+// CHECK: scf.if %[[VAL_61]] {
+// CHECK: %[[VAL_62:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_49]]] : memref<?xf32>
+// CHECK: %[[VAL_63:.*]] = arith.cmpf ult, %[[VAL_62]], %[[VAL_6]] : f32
+// CHECK: memref.store %[[VAL_63]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: %[[VAL_64:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
+// CHECK: scf.if %[[VAL_64]] {
+// CHECK: %[[VAL_65:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_50]]] : memref<?xf32>
+// CHECK: %[[VAL_66:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_65]] : f32
+// CHECK: memref.store %[[VAL_66]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_67:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
+// CHECK: %[[VAL_68:.*]] = arith.addi %[[VAL_49]], %[[VAL_5]] : index
+// CHECK: %[[VAL_69:.*]] = arith.select %[[VAL_67]], %[[VAL_68]], %[[VAL_49]] : index
+// CHECK: %[[VAL_70:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
+// CHECK: %[[VAL_71:.*]] = arith.addi %[[VAL_50]], %[[VAL_5]] : index
+// CHECK: %[[VAL_72:.*]] = arith.select %[[VAL_70]], %[[VAL_71]], %[[VAL_50]] : index
+// CHECK: scf.yield %[[VAL_69]], %[[VAL_72]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_73:.*]] = %[[VAL_74:.*]]#0 to %[[VAL_39]] step %[[VAL_5]] {
+// CHECK: %[[VAL_75:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_73]]] : memref<?xindex>
+// CHECK: %[[VAL_76:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_73]]] : memref<?xf32>
+// CHECK: %[[VAL_77:.*]] = arith.cmpf ult, %[[VAL_76]], %[[VAL_6]] : f32
+// CHECK: memref.store %[[VAL_77]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_75]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: scf.for %[[VAL_78:.*]] = %[[VAL_79:.*]]#1 to %[[VAL_42]] step %[[VAL_5]] {
+// CHECK: %[[VAL_80:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_78]]] : memref<?xindex>
+// CHECK: %[[VAL_81:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_78]]] : memref<?xf32>
+// CHECK: %[[VAL_82:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_81]] : f32
+// CHECK: memref.store %[[VAL_82]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_80]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: %[[VAL_83:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
+// CHECK: scf.if %[[VAL_83]] {
+// CHECK: %[[VAL_84:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK: %[[VAL_85:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
+// CHECK: %[[VAL_86:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_85]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_87:.*]] = %[[VAL_84]] to %[[VAL_86]] step %[[VAL_5]] {
+// CHECK: %[[VAL_88:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_87]]] : memref<?xindex>
+// CHECK: %[[VAL_89:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_87]]] : memref<?xf32>
+// CHECK: %[[VAL_90:.*]] = arith.cmpf ult, %[[VAL_89]], %[[VAL_6]] : f32
+// CHECK: memref.store %[[VAL_90]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_88]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: %[[VAL_91:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
+// CHECK: scf.if %[[VAL_91]] {
+// CHECK: %[[VAL_92:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_93:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
+// CHECK: %[[VAL_94:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_93]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_95:.*]] = %[[VAL_92]] to %[[VAL_94]] step %[[VAL_5]] {
+// CHECK: %[[VAL_96:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_95]]] : memref<?xindex>
+// CHECK: %[[VAL_97:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_95]]] : memref<?xf32>
+// CHECK: %[[VAL_98:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_97]] : f32
+// CHECK: memref.store %[[VAL_98]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_96]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_99:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
+// CHECK: %[[VAL_100:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
+// CHECK: %[[VAL_101:.*]] = arith.select %[[VAL_99]], %[[VAL_100]], %[[VAL_28]] : index
+// CHECK: %[[VAL_102:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
+// CHECK: %[[VAL_103:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
+// CHECK: %[[VAL_104:.*]] = arith.select %[[VAL_102]], %[[VAL_103]], %[[VAL_29]] : index
+// CHECK: scf.yield %[[VAL_101]], %[[VAL_104]] : index, index
+// CHECK: } attributes
+// CHECK: scf.for %[[VAL_105:.*]] = %[[VAL_106:.*]]#0 to %[[VAL_19]] step %[[VAL_5]] {
+// CHECK: %[[VAL_107:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_105]]] : memref<?xindex>
+// CHECK: %[[VAL_108:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_105]]] : memref<?xindex>
+// CHECK: %[[VAL_109:.*]] = arith.addi %[[VAL_105]], %[[VAL_5]] : index
+// CHECK: %[[VAL_110:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_109]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_111:.*]] = %[[VAL_108]] to %[[VAL_110]] step %[[VAL_5]] {
+// CHECK: %[[VAL_112:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_111]]] : memref<?xindex>
+// CHECK: %[[VAL_113:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_111]]] : memref<?xf32>
+// CHECK: %[[VAL_114:.*]] = arith.cmpf ult, %[[VAL_113]], %[[VAL_6]] : f32
+// CHECK: memref.store %[[VAL_114]], %[[VAL_17]]{{\[}}%[[VAL_107]], %[[VAL_112]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: scf.for %[[VAL_115:.*]] = %[[VAL_116:.*]]#1 to %[[VAL_21]] step %[[VAL_5]] {
+// CHECK: %[[VAL_117:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_115]]] : memref<?xindex>
+// CHECK: %[[VAL_118:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_115]]] : memref<?xindex>
+// CHECK: %[[VAL_119:.*]] = arith.addi %[[VAL_115]], %[[VAL_5]] : index
+// CHECK: %[[VAL_120:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_119]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_121:.*]] = %[[VAL_118]] to %[[VAL_120]] step %[[VAL_5]] {
+// CHECK: %[[VAL_122:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_121]]] : memref<?xindex>
+// CHECK: %[[VAL_123:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_121]]] : memref<?xf32>
+// CHECK: %[[VAL_124:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_123]] : f32
+// CHECK: memref.store %[[VAL_124]], %[[VAL_17]]{{\[}}%[[VAL_117]], %[[VAL_122]]] : memref<32x16xi1>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_125:.*]] = bufferization.to_tensor %[[VAL_17]] : memref<32x16xi1>
+// CHECK: return %[[VAL_125]] : tensor<32x16xi1>
+// CHECK: }
+func.func @cmp_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #Tss>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
+ %0 = linalg.generic #trait2
+ ins(%arga, %argb: tensor<32x16xf32, #Tss>, tensor<32x16xf32, #Tss>)
+ outs(%argx: tensor<32x16xi1>) {
+ ^bb(%a: f32, %b: f32, %x: i1):
+ %0 = arith.cmpf ult, %a, %b : f32
+ linalg.yield %0 : i1
+ } -> tensor<32x16xi1>
+ return %0 : tensor<32x16xi1>
+}
+
#BatchedVector = #sparse_tensor.encoding<{
lvlTypes = [ "dense", "compressed-hi" ],
}>
@@ -671,22 +1121,22 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
// CHECK: %[[VAL_60:.*]] = arith.addi %[[VAL_31]], %[[VAL_4]] : index
// CHECK: %[[VAL_61:.*]] = arith.select %[[VAL_59]], %[[VAL_60]], %[[VAL_31]] : index
// CHECK: scf.yield %[[VAL_58]], %[[VAL_61]], %[[VAL_62:.*]] : index, index, tensor<2x3xf64, #{{.*}}>>
-// CHECK: } attributes {"Emitted from" = "linalg.generic"}
+// CHECK: } attributes
// CHECK: %[[VAL_63:.*]] = scf.for %[[VAL_64:.*]] = %[[VAL_65:.*]]#0 to %[[VAL_18]] step %[[VAL_4]] iter_args(%[[VAL_66:.*]] = %[[VAL_65]]#2)
// CHECK: %[[VAL_67:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_64]]] : memref<?xindex>
// CHECK: %[[VAL_68:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_64]]] : memref<?xf64>
// CHECK: %[[VAL_69:.*]] = sparse_tensor.insert %[[VAL_68]] into %[[VAL_66]]{{\[}}%[[VAL_13]], %[[VAL_67]]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: scf.yield %[[VAL_69]] : tensor<2x3xf64, #{{.*}}>>
-// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: }
// CHECK: %[[VAL_70:.*]] = scf.for %[[VAL_71:.*]] = %[[VAL_72:.*]]#1 to %[[VAL_22]] step %[[VAL_4]] iter_args(%[[VAL_73:.*]] = %[[VAL_74:.*]])
// CHECK: %[[VAL_75:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_71]]] : memref<?xindex>
// CHECK: %[[VAL_76:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_71]]] : memref<?xf64>
// CHECK: %[[VAL_77:.*]] = arith.negf %[[VAL_76]] : f64
// CHECK: %[[VAL_78:.*]] = sparse_tensor.insert %[[VAL_77]] into %[[VAL_73]]{{\[}}%[[VAL_13]], %[[VAL_75]]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: scf.yield %[[VAL_78]] : tensor<2x3xf64, #{{.*}}>>
-// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: }
// CHECK: scf.yield %[[VAL_79:.*]] : tensor<2x3xf64, #{{.*}}>>
-// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: }
// CHECK: %[[VAL_80:.*]] = sparse_tensor.load %[[VAL_81:.*]] hasInserts : tensor<2x3xf64, #{{.*}}>>
// CHECK: return %[[VAL_80]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: }
@@ -1140,9 +1590,9 @@ func.func @scale(%arga: tensor<?x?xf64, #Tds>, %argx: tensor<?x?xf64>) -> tensor
// CHECK: %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] : f32
// CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_26]], %[[VAL_30]] : f32
// CHECK: memref.store %[[VAL_31]], %[[VAL_14]]{{\[}}%[[VAL_18]], %[[VAL_25]]] : memref<?x?xf32>
-// CHECK: } {"Emitted from" = "linalg.generic"}
-// CHECK: } {"Emitted from" = "linalg.generic"}
-// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: }
+// CHECK: }
+// CHECK: }
// CHECK: %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<?x?xf32>
// CHECK: return %[[VAL_32]] : tensor<?x?xf32>
// CHECK: }
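Taken together, the @cmp_ss_ss CHECK lines above exercise three forms of the
comparison, chosen by which operand has a stored value at the current
coordinate. A minimal scalar sketch of that pattern (the names %a, %b and the
zero constant %f0 are hypothetical placeholders for the loaded values and the
synthetic zero):

  %f0 = arith.constant 0.0 : f32
  // Coordinate stored in both operands: compare the two loaded values.
  %both = arith.cmpf ult, %a, %b : f32
  // Coordinate stored only in the left operand: zero on the right-hand side.
  %lhs_only = arith.cmpf ult, %a, %f0 : f32
  // Coordinate stored only in the right operand: zero on the left-hand side.
  %rhs_only = arith.cmpf ult, %f0, %b : f32

Only the overlap case reads both tensors; the remainder loops substitute the
zero constant, so the kernel can still yield true at coordinates where one
operand has no stored entry.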
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
new file mode 100644
index 0000000000000..33861a79f6805
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
@@ -0,0 +1,146 @@
+// DEFINE: %{option} = "enable-runtime-library=false"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: -e entry -entry-point-result=void \
+// DEFINE: -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// RUN: %{compile} | %{run}
+
+// Do the same run, but now with direct IR generation and, if available, VLA
+// vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \
+// REDEFINE: --entry-function=entry_lli \
+// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \
+// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \
+// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
+// REDEFINE: FileCheck %s
+// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run}
+
+#DCSR = #sparse_tensor.encoding<{
+ lvlTypes = [ "compressed", "compressed" ]
+}>
+
+#trait = {
+ indexing_maps = [
+ affine_map<(i,j) -> (i,j)>, // A
+ affine_map<(i,j) -> (i,j)>, // B
+ affine_map<(i,j) -> (i,j)> // x (out)
+ ],
+ iterator_types = ["parallel", "parallel"],
+ doc = "X(i, j) = cmp A(i,j) B(i, j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes matching sparse storage schemes,
+// and runs the resulting code with the JIT compiler.
+//
+module {
+ func.func @cmp_all_dense(%arga: tensor<4x4xf64>,
+ %argb: tensor<4x4xf64>,
+ %argx: tensor<4x4xi8>) -> tensor<4x4xi8> {
+ %0 = linalg.generic #trait
+ ins(%arga, %argb: tensor<4x4xf64>, tensor<4x4xf64>)
+ outs(%argx: tensor<4x4xi8>) {
+ ^bb(%a: f64, %b: f64, %x: i8):
+ %0 = arith.cmpf ult, %a, %b : f64
+ %1 = arith.extui %0 : i1 to i8
+ linalg.yield %1 : i8
+ } -> tensor<4x4xi8>
+ return %0 : tensor<4x4xi8>
+ }
+
+ func.func @cmp_lhs_sparse(%arga: tensor<4x4xf64, #DCSR>,
+ %argb: tensor<4x4xf64>) -> tensor<4x4xi8, #DCSR> {
+ %argx = bufferization.alloc_tensor() : tensor<4x4xi8, #DCSR>
+ %0 = linalg.generic #trait
+ ins(%arga, %argb: tensor<4x4xf64, #DCSR>, tensor<4x4xf64>)
+ outs(%argx: tensor<4x4xi8, #DCSR>) {
+ ^bb(%a: f64, %b: f64, %x: i8):
+ %0 = arith.cmpf ult, %a, %b : f64
+ %1 = arith.extui %0 : i1 to i8
+ linalg.yield %1 : i8
+ } -> tensor<4x4xi8, #DCSR>
+ return %0 : tensor<4x4xi8, #DCSR>
+ }
+
+ func.func @cmp_all_sparse(%arga: tensor<4x4xf64, #DCSR>,
+ %argb: tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR> {
+ %argx = bufferization.alloc_tensor() : tensor<4x4xi8, #DCSR>
+ %0 = linalg.generic #trait
+ ins(%arga, %argb: tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>)
+ outs(%argx: tensor<4x4xi8, #DCSR>) {
+ ^bb(%a: f64, %b: f64, %x: i8):
+ %0 = arith.cmpf ult, %a, %b : f64
+ %1 = arith.extui %0 : i1 to i8
+ linalg.yield %1 : i8
+ } -> tensor<4x4xi8, #DCSR>
+ return %0 : tensor<4x4xi8, #DCSR>
+ }
+
+ //
+ // Main driver that constructs matrix and calls the sparse kernel to perform
+ // element-wise comparison.
+ //
+ func.func @entry() {
+ %d0 = arith.constant 0 : i8
+ %c0 = arith.constant 0 : index
+
+ %lhs_dn = arith.constant dense<
+ [ [ 0.0, 0.0, 1.5, 1.0],
+ [ 0.0, 3.5, 0.0, 0.0],
+ [ 1.0, 5.0, 2.0, 0.0],
+ [ 1.0, 0.5, 0.0, 0.0] ]> : tensor<4x4xf64>
+
+ %rhs_dn = arith.constant dense<
+ [ [ 0.0, 1.5, 1.0, 1.5],
+ [ 3.5, 0.0, 0.0, 0.0],
+ [ 5.0, 2.0, 0.0, 2.0],
+ [ 0.5, 0.0, 0.0, 0.0] ]> : tensor<4x4xf64>
+
+ %lhs_sp = sparse_tensor.convert %lhs_dn : tensor<4x4xf64> to tensor<4x4xf64, #DCSR>
+ %rhs_sp = sparse_tensor.convert %rhs_dn : tensor<4x4xf64> to tensor<4x4xf64, #DCSR>
+
+ %output = arith.constant dense<0> : tensor<4x4xi8>
+ %all_dn_out = call @cmp_all_dense(%lhs_dn, %rhs_dn, %output)
+ : (tensor<4x4xf64>, tensor<4x4xf64>, tensor<4x4xi8>) -> tensor<4x4xi8>
+ %lhs_sp_out = call @cmp_lhs_sparse(%lhs_sp, %rhs_dn)
+ : (tensor<4x4xf64, #DCSR>, tensor<4x4xf64>) -> tensor<4x4xi8, #DCSR>
+ %all_sp_out = call @cmp_all_sparse(%lhs_sp, %rhs_sp)
+ : (tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR>
+
+ //
+ // All should have the same result.
+ //
+ // CHECK-COUNT-3: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
+ %v = vector.transfer_read %all_dn_out[%c0, %c0], %d0
+ : tensor<4x4xi8>, vector<4x4xi8>
+ vector.print %v : vector<4x4xi8>
+
+ %lhs_sp_ret = sparse_tensor.convert %lhs_sp_out
+ : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
+ %v1 = vector.transfer_read %lhs_sp_ret[%c0, %c0], %d0
+ : tensor<4x4xi8>, vector<4x4xi8>
+ vector.print %v1 : vector<4x4xi8>
+
+ %rhs_sp_ret = sparse_tensor.convert %all_sp_out
+ : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
+ %v2 = vector.transfer_read %rhs_sp_ret[%c0, %c0], %d0
+ : tensor<4x4xi8>, vector<4x4xi8>
+ vector.print %v2 : vector<4x4xi8>
+
+
+ bufferization.dealloc_tensor %lhs_sp : tensor<4x4xf64, #DCSR>
+ bufferization.dealloc_tensor %rhs_sp : tensor<4x4xf64, #DCSR>
+ bufferization.dealloc_tensor %lhs_sp_out : tensor<4x4xi8, #DCSR>
+ bufferization.dealloc_tensor %all_sp_out : tensor<4x4xi8, #DCSR>
+
+ return
+ }
+}
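As a quick sanity check on the expected output: the first row compares the lhs
values (0.0, 0.0, 1.5, 1.0) against the rhs values (0.0, 1.5, 1.0, 1.5) with
ult, which after extui gives (0, 1, 0, 1), the first row of the CHECK-COUNT-3
vector. The sparse kernels can only skip coordinates where both operands are
zero, and 0 < 0 is false anyway, so converting their results back to dense form
yields the same matrix as the all-dense kernel.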
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index 38c3b363b585f..b854ce8d7aa8a 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -34,7 +34,9 @@ namespace {
DO(subi, TensorExp::Kind::kSubI) \
DO(andi, TensorExp::Kind::kAndI) \
DO(xori, TensorExp::Kind::kXorI) \
- DO(ori, TensorExp::Kind::kOrI)
+ DO(ori, TensorExp::Kind::kOrI) \
+ DO(cmpf, TensorExp::Kind::kCmpF) \
+ DO(cmpi, TensorExp::Kind::kCmpI)
// TODO: Disjunctive binary operations that need special handling are not
// included, e.g., division is not tested (for now) as it needs a constant
@@ -109,6 +111,7 @@ struct Pattern {
/// Constructors.
/// Rather than using these, please use the readable helper constructor
/// functions below to make tests more readable.
+ Pattern() : kind(TensorExp::Kind::kSynZero) {}
Pattern(TensorId tid) : kind(TensorExp::Kind::kTensor), tid(tid) {}
Pattern(TensorExp::Kind kind, PatternRef e0, PatternRef e1)
: kind(kind), children(e0, e1) {
@@ -122,6 +125,7 @@ struct Pattern {
///
static Pattern tensorPattern(TensorId tid) { return Pattern(tid); }
+static Pattern synZeroPattern() { return Pattern(); }
#define IMPL_BINOP_PATTERN(OP, KIND) \
LLVM_ATTRIBUTE_UNUSED static Pattern OP##Pattern(PatternRef e0, \
@@ -232,6 +236,9 @@ class MergerTestBase : public ::testing::Test {
// Leaf.
case TensorExp::Kind::kTensor:
return tensorExp.tensor == pattern.tid;
+ case TensorExp::Kind::kSynZero:
+ // Already checked kind equivalence @L233
+ return true;
case TensorExp::Kind::kInvariant:
llvm_unreachable("invariant not handled yet");
case TensorExp::Kind::kLoopVar:
@@ -289,6 +296,8 @@ class MergerTestBase : public ::testing::Test {
case TensorExp::Kind::kAndI:
case TensorExp::Kind::kOrI:
case TensorExp::Kind::kXorI:
+ case TensorExp::Kind::kCmpF:
+ case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
case TensorExp::Kind::kShlI:
@@ -752,6 +761,79 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ)
FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ)
+/// Vector element-wise comparison (disjunction) of two vectors, i.e.,
+///   a(i) = b(i) cmp c(i)
+/// which should form the 3 lattice points
+/// {
+/// lat( i_00 i_01 / (tensor_0 cmp tensor_1) )
+/// lat( i_00 / tensor_0 cmp 0 )
+/// lat( i_01 / 0 cmp tensor_1 )
+/// }
+/// and after optimization the lattice points do not change (there is no
+/// duplicated point, and all input vectors are sparse).
+/// {
+/// lat( i_00 i_01 / (tensor_0 cmp tensor_1) )
+/// lat( i_00 / tensor_0 cmp 0 )
+/// lat( i_01 / 0 cmp tensor_1 )
+/// }
+TEST_F(MergerTest3T1L, vector_cmp) {
+ const auto e = cmpiExpr(tensor(0), tensor(1));
+ const auto l0 = lid(0);
+ const auto t0 = tid(0);
+ const auto t1 = tid(1);
+ PatternRef zero = synZeroPattern();
+ PatternRef p0 = tensorPattern(t0);
+ PatternRef p1 = tensorPattern(t1);
+ auto s = merger.buildLattices(e, l0);
+ expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
+ loopsToBits({{l0, t0}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
+ loopsToBits({{l0, t1}}));
+ s = merger.optimizeSet(s);
+ expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
+ loopsToBits({{l0, t0}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
+ loopsToBits({{l0, t1}}));
+}
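These three lattice points mirror the three comparison forms generated for
@cmp_ss_ss in sparse_2d.mlir earlier in this patch: the overlap branch compares
the two stored values, and the two remainder loops compare a stored value
against the synthetic zero constant.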
+
+/// Vector element-wise comparison (disjunction) of two vectors, i.e.,
+/// a(i) = b(i) cmp c(i)
+/// which should form the 3 lattice points
+/// {
+/// lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) )
+/// lat( i_00 / sparse_tensor_0 cmp 0)
+/// lat( i_01 / 0 cmp dense_tensor_1 )
+/// }
+/// which should be optimized to
+/// {
+/// lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ) (not singleton)
+///   lat( i_01 / 0 cmp dense_tensor_1 )
+/// }
+///
+/// lat( i_00 / sparse_tensor_0 cmp 0 ) should be opted out, as it differs from
+/// lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ) only in the dense
+/// operand.
+TEST_F(MergerTest3T1LD, vector_cmp) {
+ const auto e = cmpiExpr(tensor(0), tensor(1));
+ const auto l0 = lid(0);
+ const auto t0 = tid(0);
+ const auto t1 = tid(1);
+ PatternRef zero = synZeroPattern();
+ PatternRef p0 = tensorPattern(t0);
+ PatternRef p1 = tensorPattern(t1);
+ auto s = merger.buildLattices(e, l0);
+ expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
+ loopsToBits({{l0, t0}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
+ loopsToBits({{l0, t1}}));
+ s = merger.optimizeSet(s);
+ expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
+ expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
+ loopsToBits({{l0, t1}}));
+}
+
#undef IMPL_MERGER_TEST_OPTIMIZED_CONJ
// TODO: mult-dim tests