[Mlir-commits] [mlir] 2cd1592 - [mlir][sparse] implement index reduction on dense level (for CSR)

Peiming Liu llvmlistbot at llvm.org
Mon Apr 17 09:36:38 PDT 2023


Author: Peiming Liu
Date: 2023-04-17T16:36:31Z
New Revision: 2cd15925f4485fc618bc33c1337e1b6f63d84ef6

URL: https://github.com/llvm/llvm-project/commit/2cd15925f4485fc618bc33c1337e1b6f63d84ef6
DIFF: https://github.com/llvm/llvm-project/commit/2cd15925f4485fc618bc33c1337e1b6f63d84ef6.diff

LOG: [mlir][sparse] implement index reduction on dense level (for CSR)

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D147550
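
For context, the upshot of the change for a dense slice-driven level (e.g. the
column loop of a CSR convolution) is: when the index dependency is fully
reduced, the generated for-loop runs up to the slice size; otherwise it runs up
to lvlSize - sliceSz + 1, and dense positions are linearized as
pos = pos * lvlSize + iv. The C++ below is only a conceptual sketch of that
iteration space, not the emitter's generated IR; lvlSize, sliceSz, offset and
iv mirror names used in the patch.

// Conceptual sketch: iteration space of a dense slice-driven level for a
// 1-D convolution with an 8-wide dense input and a 3-wide filter (6 outputs).
#include <cstdio>

int main() {
  const int lvlSize = 8; // size of the dense level
  const int sliceSz = 3; // size of the slice (filter extent)

  // Unresolved slice: enumerate candidate offsets, hi = lvlSize - sliceSz + 1.
  for (int offset = 0; offset < lvlSize - sliceSz + 1; ++offset) {
    // Fully reduced slice: iterate the slice itself, hi = sliceSz.
    for (int iv = 0; iv < sliceSz; ++iv) {
      // With a single dense level, the linearized position is offset + iv;
      // deeper dense levels fold in as pos = (pos * lvlSize) + iv.
      int pos = offset + iv;
      std::printf("offset=%d iv=%d pos=%d\n", offset, iv, pos);
    }
  }
  return 0;
}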

Added: 
    

Modified: 
    mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
    mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
    mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
    mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir

Removed: 
    


################################################################################
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
index b624aaddd21df..c1608530b7b1f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -29,6 +29,8 @@ using namespace mlir::sparse_tensor;
   (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, l, r)           \
        .getResult())
 
+#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, lhs, rhs))
+
 #define C_IDX(v) (constantIndex(builder, loc, v))
 
 /// Generates a pointer/index load from the sparse storage scheme. Narrower
@@ -500,16 +502,17 @@ void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
       assert(sliceStack[tid].back().slicedOnLvl == lvl);
       sliceStack[tid].pop_back();
     } else {
-      // Else this is a resolved-slice, and advance posit similar to TACO.
-      Value c1 = C_IDX(1), c2 = C_IDX(2);
-
-      // pIdx += 2, we finished the current lvl, advance the pointer index of
-      // the previous level by two to skip the [pLo, pHi] for current level.
-      Value sPtrBuf = slicePosBuffer[tid][lvl].back();
-      Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
-      Value nexP = builder.create<arith::AddIOp>(loc, curP, c2);
-      // TODO: we could probably use an SSA value for it.
-      builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
+      if (!isDenseDLT(lvlTypes[tid][lvl])) {
+        // Else this is a resolved-slice, and advance posit similar to TACO.
+        Value c1 = C_IDX(1), c2 = C_IDX(2);
+        // pIdx += 2, we finished the current lvl, advance the pointer index of
+        // the previous level by two to skip the [pLo, pHi] for current level.
+        Value sPtrBuf = slicePosBuffer[tid][lvl].back();
+        Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
+        Value nexP = builder.create<arith::AddIOp>(loc, curP, c2);
+        // TODO: we could probably use an SSA value for it.
+        builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
+      }
     }
   }
   loopSeqStack.pop_back();
@@ -547,11 +550,9 @@ Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
   }
 }
 
-Operation *LoopEmitter::emitForLoopOverTensorAtLvl(OpBuilder &builder,
-                                                   Location loc, TensorId tid,
-                                                   Level dstLvl,
-                                                   MutableArrayRef<Value> reduc,
-                                                   bool isParallel) {
+Operation *LoopEmitter::emitForLoopOverTensorAtLvl(
+    OpBuilder &builder, Location loc, TensorId tid, Level dstLvl, Value lo,
+    Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
   bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) ||
                       isSingletonDLT(lvlTypes[tid][dstLvl]);
 
@@ -561,9 +562,6 @@ Operation *LoopEmitter::emitForLoopOverTensorAtLvl(OpBuilder &builder,
   // biggest range).
   const Level srcLvl = reassoc.front();
   Value step = C_IDX(1);
-  Value lo = isSparseCond ? posits[tid][srcLvl]        // current offset
-                          : loopSeqStack.back().first; // universal index
-  Value hi = highs[tid][srcLvl];
 
   Operation *loop = nullptr;
   Value iv;
@@ -682,7 +680,7 @@ Operation *LoopEmitter::enterLoopOverTensorAtLvl(
     ArrayRef<Level> lvls, MutableArrayRef<Value> reduc, bool isParallel) {
   // TODO: support multiple return on parallel for?
   assert(!isParallel || reduc.size() <= 1);
-  bool isSparseCond = false, isSliceCond = false;
+  bool isSparseCond = false, isSparseSliceCond = false;
   size_t tid = tids.front(), lvl = lvls.front();
 
   // Finds out the tensor level that we should use to generate loops. Amongs all
@@ -691,25 +689,25 @@ Operation *LoopEmitter::enterLoopOverTensorAtLvl(
     assert(lvlTypes[t].size() > l);         // Must be a valid tid, dim pair
     assert(!coords[t][l] ||                 // We cannot re-enter the same level
            !dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop
-    auto dimType = lvlTypes[t][l];
+    auto lvlType = lvlTypes[t][l];
     // Must be a recognizable DLT.
-    assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
-           isSingletonDLT(dimType));
+    assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) ||
+           isSingletonDLT(lvlType));
 
-    // This is a slice-driven loop.
-    if (!dependentLvlMap[t][l].empty()) {
-      assert(!isSliceCond && !isSparseCond);
-      isSliceCond = true;
+    // This is a slice-driven loop on a sparse level.
+    if (!dependentLvlMap[t][l].empty() && !isDenseDLT(lvlType)) {
+      assert(!isSparseSliceCond && !isSparseCond);
+      isSparseSliceCond = true;
       tid = t;
       lvl = l;
       continue;
     }
 
-    bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
+    bool isSparse = isCompressedDLT(lvlType) || isSingletonDLT(lvlType);
     // We can at most have one sparse input, otherwise, a while loop is
     // required to co-iterate multiple sparse tensors.
     assert(!isSparseCond || !isSparse);
-    assert(!isSliceCond || !isSparseCond);
+    assert(!isSparseSliceCond || !isSparseCond);
     if (isSparse) {
       tid = t;
       lvl = l;
@@ -717,10 +715,27 @@ Operation *LoopEmitter::enterLoopOverTensorAtLvl(
     isSparseCond = isSparseCond || isSparse;
   }
 
+  DimLevelType lvlType = lvlTypes[tid][lvl];
+  // TODO: Dense slice-driven loops can be generated using for loops as well.
+  assert(!isSparseSliceCond || !isDenseDLT(lvlType));
+  bool isDenseSliceCond =
+      isDenseDLT(lvlType) && !dependentLvlMap[tid][lvl].empty();
+  // If the slice is fully reduced, we can now use the TACO-based algorithm to
+  // iterate over it.
+
+  Operation *l = nullptr;
+
+  // At most one tensor is used as the for-loop condition.
+  SmallVector<TensorId, 1> condTid;
+  SmallVector<Level, 1> condLvl;
+  // There might be multiple dense slice-driven tensors.
+  SmallVector<TensorId> sliceTids;
+  SmallVector<Level> sliceLvls;
+  SmallVector<bool> sliceReduc;
+
   // Generates loops differently depending on whether we need a slice-driven
   // loop or a simple level traversal loop.
-  Operation *l = nullptr;
-  if (isSliceCond) {
+  if (isSparseSliceCond) {
     bool fullyReduced = depFullyReduced(tid, lvl);
     if (!fullyReduced) {
       l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc);
@@ -733,22 +748,63 @@ Operation *LoopEmitter::enterLoopOverTensorAtLvl(
           lvl, reduc);
     }
     levelReducedDep[tid][lvl]++;
-    // We can also prepare for next dim here in advance
-    // Pushes the loop into stack.
-    loopStack.emplace_back(
-        ArrayRef<TensorId>(), ArrayRef<Level>(), ArrayRef<TensorId>(tid),
-        ArrayRef<Level>(lvl), ArrayRef<bool>(fullyReduced), l,
-        builder.getInsertionBlock(), coords[tid][lvl], loopTag);
+    sliceTids.push_back(tid);
+    sliceLvls.push_back(lvl);
+    sliceReduc.push_back(fullyReduced);
   } else {
-    l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc, isParallel);
-    // We can also prepare for next dim here in advance
-    // Pushes the loop into stack.
-    loopStack.emplace_back(ArrayRef<TensorId>(tid), ArrayRef<Level>(lvl),
-                           ArrayRef<TensorId>(), ArrayRef<Level>(),
-                           ArrayRef<bool>(), l, builder.getInsertionBlock(),
-                           coords[tid][lvl], loopTag);
+    Value lo = isSparseCond ? posits[tid][lvl]           // current offset
+                            : loopSeqStack.back().first; // universal index
+    Value hi = highs[tid][lvl];
+    if (isDenseSliceCond) {
+      bool fullyReduced = depFullyReduced(tid, lvl);
+      Value sliceSz = sliceSizes[tid][lvl][sliceStack[tid].back().depth - 1];
+      // Adjust for loop hi for dense slice-driven loop.
+      if (fullyReduced) {
+        hi = sliceSz;
+        condTid.push_back(tid);
+        condLvl.push_back(lvl);
+      } else {
+        hi = builder.create<arith::SubIOp>(loc, lvlSizes[tid][lvl], sliceSz);
+        hi = builder.create<arith::AddIOp>(loc, hi, C_IDX(1));
+      }
+    } else {
+      condTid.push_back(tid);
+      condLvl.push_back(lvl);
+    }
+    l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, reduc,
+                                   isParallel);
   }
-
+  Value iv = coords[tid][lvl];
+  for (auto [t, l] : llvm::zip(tids, lvls)) {
+    // We only need to handle slice-driven loops on dense levels here.
+    // If it is a slice-driven loop on a sparse level, it needs a while loop to
+    // insert break statements, and it must have been handled correctly in L692.
+    if (!dependentLvlMap[t][l].empty() && isDenseDLT(lvlTypes[t][l])) {
+      // Pushes sliced levels to build correct LoopInfo.
+      bool fullyReduc = depFullyReduced(t, l);
+      SliceInfo &info = sliceStack[t].back();
+      if (fullyReduc) {
+        posits[t][l] =
+            genAddress(builder, loc, t, l,
+                       builder.create<arith::AddIOp>(loc, info.offset, iv));
+      } else {
+        // Puts sliced dense loop into LoopInfo so that LoopEmitter knows how to
+        // exit it.
+        sliceTids.push_back(t);
+        sliceLvls.push_back(l);
+        sliceReduc.push_back(fullyReduc);
+        // Update the slice information as we enter the new loop.
+        assert(*info.slicedOnLvl == l);
+        info.minCrd = info.offset = iv;
+        info.isNonEmpty = constantI1(builder, loc, true);
+        levelReducedDep[t][l]++;
+      }
+    }
+  }
+  // NOTE: we can also prepare for next dim here in advance
+  // Pushes the loop into stack.
+  loopStack.emplace_back(condTid, condLvl, sliceTids, sliceLvls, sliceReduc, l,
+                         builder.getInsertionBlock(), iv, loopTag);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tids, lvls);
   return l;
@@ -1106,6 +1162,10 @@ void LoopEmitter::emitExtraLocalsForTensorsAtDenseLvls(OpBuilder &builder,
   assert(tids.size() == lvls.size());
   for (auto [tid, lvl] : llvm::zip(tids, lvls)) {
     if (isDenseDLT(lvlTypes[tid][lvl])) {
+      // Slice-driven dense levels should have been handled already.
+      if (!dependentLvlMap[tid][lvl].empty())
+        continue;
+
       auto enc = getSparseTensorEncoding(tensors[tid].getType());
       if (enc && !isSparseOutput(tid)) {
         bool validPos = lvl == 0 || posits[tid][lvl - 1];
@@ -1127,6 +1187,18 @@ void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                               MutableArrayRef<Value> reduc) {
   const LoopInfo &loopInfo = loopStack.back();
   rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
+  for (auto [tid, lvl, reduced] : llvm::zip(
+           loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) {
+    SliceInfo &info = sliceStack[tid].back();
+    assert(isDenseDLT(lvlTypes[tid][lvl]));
+    assert(*info.slicedOnLvl == lvl && !reduced);
+    (void)reduced;
+    // Resets slice pointers as the resolved slices are invalidated after we
+    // move forward to the next slice.
+    invalidateSliceIterIdx(rewriter, loc, tid, lvl);
+    info.minCrd = info.offset = info.isNonEmpty = Value();
+    levelReducedDep[tid][lvl]--;
+  }
   if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
     if (!reduc.empty()) {
       assert(reduc.size() == forOp.getNumResults());
@@ -1220,6 +1292,8 @@ void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
   unsigned delta = 0;
   for (auto [tid, lvl, resolved] : llvm::zip(
            loopInfo.slicedTids, loopInfo.slicedLvls, loopInfo.sliceReduced)) {
+    // TODO: handle dense.
+    assert(isCompressedDLT(lvlTypes[tid][lvl]));
     levelReducedDep[tid][lvl]--;
     if (!resolved) {
       genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o);
@@ -1338,18 +1412,15 @@ unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const {
   return totalDependencies;
 }
 
-const LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(TensorId tid,
-                                                              Level lvl) {
+const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid,
+                                                                   Level lvl) {
   // Finds the most-recent slice using a reverse iteration.
   for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie;
        it++) {
     if (it->slicedOnLvl == lvl) { // the level matched
-      // Must be the final slice we need to fully reduced the expression too.
-      assert(it->depth == dependentLvlMap[tid][lvl].size() - 1);
       return *it;
     }
   }
-
   llvm_unreachable("Failed to find sliceInfo");
 }
 
@@ -1366,9 +1437,7 @@ const LoopEmitter::SliceInfo &LoopEmitter::getFinalSliceOnLvl(TensorId tid,
 std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
     OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset,
     Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield,
-    llvm::function_ref<void(OpBuilder &, Location, Value,
-                            MutableArrayRef<Value>)>
-        bodyBuilder) {
+    LoopBodyBuilder bodyBuilder) {
   Value c1 = C_IDX(1);
   Value sliceHi = builder.create<arith::AddIOp>(loc, offset, size);
 
@@ -1454,40 +1523,106 @@ std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
 //   }
 // }
 ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
-    OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl,
-    size_t depth, ValueRange userReduc,
-    llvm::function_ref<void(OpBuilder &, Location, Value,
-                            MutableArrayRef<Value>)>
-        bodyBuilder) {
-
+    OpBuilder &builder, Location loc, TensorId tid,
+    ArrayRef<const SliceInfo *> unResLvls, ValueRange userReduc,
+    LoopBodyBuilder bodyBuilder) {
+  // assert(unResLvls.size() == 1 && "TODO");
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
 
-  // TODO: it only works on all compressed tensor.
-  Value sPtrBuf = slicePosBuffer[tid][lvl][depth];
-  Value pSt = c2;                                      // pointer starting index
-  Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
-
-  auto forOp =
-      scf::buildLoopNest(
-          builder, loc, pSt, mSz, c2, userReduc,
-          [this, c1, tid, lvl, offset, sPtrBuf,
-           bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs,
-                        ValueRange iterArgs) -> scf::ValueVector {
+  const SliceInfo &frontSlice = *unResLvls.back();
+  Level firstLvl = *frontSlice.slicedOnLvl;
+  assert(!lvlFullyResolved(tid, firstLvl) && "TODO");
+
+  // FIXME: it is not zero when the first level is fully resolved.
+  Value pos = c0;
+  OpBuilder::InsertPoint ip;
+  SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
+  scf::ForOp outerMost = nullptr;
+  if (!lvlFullyResolved(tid, firstLvl)) {
+    if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
+      unsigned depth = frontSlice.depth - 1;
+      Value offset = frontSlice.offset;
+      Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
+      Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
+      outerMost = builder.create<scf::ForOp>(
+          loc, c2, mSz, c2, innerArgs,
+          [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos, &innerArgs](
+              OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) {
             // generate traversal for each level.
-            Value loopLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front());
-            Value loopHi = genIndexLoad(
-                builder, loc, sPtrBuf,
-                builder.create<arith::AddIOp>(loc, ivs.front(), c1));
-            return genSliceLvlTraverseLoop(builder, loc, loopLo, loopHi, offset,
-                                           sliceSizes[tid][lvl].back(), tid,
-                                           lvl, iterArgs, true, bodyBuilder)
-                .second;
-          })
-          .loops.front();
+            Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
+            Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
+            ValueRange itArgs =
+                genSliceLvlTraverseLoop(
+                    builder, loc, loopLo, loopHi, offset,
+                    sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
+                    false,
+                    [&](OpBuilder &builder, Location, Value iv,
+                        MutableArrayRef<Value> reduc) {
+                      ip = builder.saveInsertionPoint();
+                      pos = iv;
+                      innerArgs.assign(reduc.begin(), reduc.end());
+                    })
+                    .second;
+            builder.create<scf::YieldOp>(loc, itArgs);
+          });
+    } else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
+      assert(firstLvl == 0); // This must be the first level.
+      Value lb = frontSlice.offset;
+      Value sliceSz =
+          sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
+      Value ub = ADDI(lb, sliceSz);
+      outerMost = builder.create<scf::ForOp>(
+          loc, lb, ub, c1, innerArgs,
+          [&](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) {
+            ip = builder.saveInsertionPoint();
+            pos = iv;
+            innerArgs.assign(iterArgs.begin(), iterArgs.end());
+          });
+    }
+    // We generated the loop for the first slice above, now remove it.
+    unResLvls = unResLvls.drop_back();
+  }
 
+  // Reset the insertion point into the loop body.
+  builder.restoreInsertionPoint(ip);
+  if (!unResLvls.empty()) {
+    // Fills in dense slice levels in between.
+    SmallVector<Value> lbs, ubs, steps, lvlSzs;
+    for (const SliceInfo *slice : llvm::reverse(unResLvls)) {
+      Level sliceLvl = *slice->slicedOnLvl;
+      assert(isDenseDLT(lvlTypes[tid][sliceLvl]));
+      Value offset = slice->offset;
+      Value sliceSz = sliceSizes[tid][sliceLvl][slice->depth - 1];
+      lbs.push_back(offset);
+      ubs.push_back(builder.create<arith::AddIOp>(loc, offset, sliceSz));
+      steps.push_back(c1);
+      lvlSzs.push_back(lvlSizes[tid][sliceLvl]);
+    }
+    auto denseNest = scf::buildLoopNest(
+        builder, loc, lbs, ubs, steps, innerArgs,
+        [&innerArgs, &lvlSzs, &pos,
+         bodyBuilder](OpBuilder &builder, Location loc, ValueRange ivs,
+                      ValueRange iterArgs) -> scf::ValueVector {
+          for (auto em : llvm::enumerate(ivs)) {
+            // Linearizes position: pos = (pos * lvlSize) + iv;
+            pos = builder.create<arith::MulIOp>(loc, pos, lvlSzs[em.index()]);
+            pos = builder.create<arith::AddIOp>(loc, pos, em.value());
+          }
+          innerArgs.assign(iterArgs.begin(), iterArgs.end());
+          // Generates the user-requested loop body.
+          // TODO: we do not have to check in-bounds for dense levels.
+          bodyBuilder(builder, loc, pos, innerArgs);
+          return innerArgs;
+        });
+    builder.create<scf::YieldOp>(loc, denseNest.results);
+  } else {
+    // Generates the user-requested loop body.
+    bodyBuilder(builder, loc, pos, innerArgs);
+    builder.create<scf::YieldOp>(loc, innerArgs);
+  }
   // Insert after current while operation.
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
+  builder.setInsertionPointAfter(outerMost);
+  return outerMost.getResults();
 }
 
 void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
@@ -1495,6 +1630,13 @@ void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
   assert(lvl == 0 && "TODO: handle non-first level");
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3),
         c4 = C_IDX(4);
+  if (isDenseDLT(lvlTypes[tid][lvl])) {
+    // Dense slice begin is trivial.
+    sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0,
+                                 /*nonEmpty=*/constantI1(builder, loc, true),
+                                 lvl, /*depth=*/1);
+    return;
+  }
   Value size = sliceSizes[tid][0][0];
   Value sPtrBuf = slicePosBuffer[tid][0][0];
   Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
@@ -1540,18 +1682,41 @@ void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
 // }
 void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                           TensorId tid, Level lvl) {
-  assert(isCompressedDLT(lvlTypes[tid][lvl]));
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
-  const SliceInfo &sliceInfo = sliceStack[tid].back();
-  unsigned prevLvl = *sliceInfo.slicedOnLvl;
-  assert(lvl >= prevLvl);
-  // Either lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one
+  unsigned depth = levelReducedDep[tid][lvl];
+  Value size = sliceSizes[tid][lvl][depth];
+  // Dense slice begin is trivial
+  if (isDenseDLT(lvlTypes[tid][lvl])) {
+    sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl,
+                                 depth + 1);
+    return;
+  }
+
+  assert(isCompressedDLT(lvlTypes[tid][lvl]));
+  // Unhandled Cases:
+  //
+  // 1st, lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one
   // variable need to be reduced on the same level).
-  // Or lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a
+  //
+  // 2nd, lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a
   // simple dim expression in between).
-  assert(lvl == prevLvl + 1 && "TODO: not yet implemented");
+  assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);
+
   // Check slice stack integrity.
-  assert(slicePosBuffer[tid][prevLvl].size() == sliceInfo.depth);
+  assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);
+
+  SmallVector<const SliceInfo *> unResSlices;
+  for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
+    Level prevLvl = curLvl - 1;
+    unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
+    if (!isDenseDLT(lvlTypes[tid][prevLvl]) || lvlFullyResolved(tid, prevLvl)) {
+      break;
+    }
+  }
+
+  assert(!unResSlices.empty() &&
+         !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl));
+
   Value sPtrBuf = slicePosBuffer[tid][lvl].back();
   SmallVector<Value, 3> reduc = {
       constantI1(builder, loc, false), // isNonEmpty
@@ -1560,7 +1725,7 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
   };
 
   ValueRange result = genUnResolvedSliceTreeTraverse(
-      builder, loc, sliceInfo.offset, tid, prevLvl, sliceInfo.depth - 1, reduc,
+      builder, loc, tid, unResSlices, reduc,
       [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc,
                                         Value iv,
                                         MutableArrayRef<Value> reduc) {
@@ -1606,8 +1771,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
         curMemSz = builder.create<arith::AddIOp>(loc, curMemSz, c2);
       });
 
-  unsigned depth = levelReducedDep[tid][lvl];
-  Value size = sliceSizes[tid][lvl][depth];
   Value isNonEmpty = result[0];
   Value minCrd = result[1];
   // Two metadata [memSize, idx].
@@ -1624,6 +1787,10 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
   Value c1 = C_IDX(1), c2 = C_IDX(2);
 
   if (depFullyReduced(tid, lvl)) {
+    // No need to prepare for a slice-driven loop on a dense level after it is
+    // fully reduced.
+    if (isDenseDLT(lvlTypes[tid][lvl]))
+      return true;
     // If constraints on the tensor is fully resolved. We do not need to
     // generates slice begin any more, instead we fall back to TACO-based
     // algorithm to (co)iterates over the slice.
@@ -1703,6 +1870,16 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
   return false;
 }
 
+void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
+                                         TensorId tid, Level lvl) {
+  for (unsigned i = 0; i <= lvl; i++) {
+    if (!isDenseDLT(lvlTypes[tid][i])) {
+      builder.create<memref::StoreOp>(loc, C_IDX(0),
+                                      slicePosBuffer[tid][i].back(), C_IDX(1));
+    }
+  }
+}
+
 void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
                                         const Operation *op, TensorId tid,
                                         Level lvl,
@@ -1712,14 +1889,11 @@ void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
     llvm_unreachable("TODO");
 
   // else generate code to compute next non empty slice.
-  Value c0 = C_IDX(0);
-  Value c1 = C_IDX(1);
-  Value c2 = C_IDX(2);
+  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
 
   auto whileOp = llvm::cast<scf::WhileOp>(op);
   SliceInfo &info = sliceStack[tid].back();
   assert(info.slicedOnLvl == lvl);
-
   //
   // We forward to the next non empty slice by
   // if (minCrd > offset) {
@@ -1735,8 +1909,7 @@ void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
   Value absOffset = info.offset;
   // Resets slices pointers as the resolved slices are invalidated after we
   // moves forward to the next slice.
-  for (unsigned i = 0; i <= lvl; i++)
-    builder.create<memref::StoreOp>(loc, c0, slicePosBuffer[tid][i].back(), c1);
+  invalidateSliceIterIdx(builder, loc, tid, lvl);
 
   SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset};
   Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
@@ -1949,4 +2122,5 @@ Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl(
 }
 
 #undef CMPI
+#undef ADDI
 #undef C_IDX
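
A note on the slice position buffer (sPtrBuf) manipulated above, as read from
this patch: index 0 holds memSize, index 1 holds the current position index,
and [pLo, pHi] pairs follow from index 2 onward, which is why the loop in
genUnResolvedSliceTreeTraverse starts at 2 and steps by 2, and why
exitCurrentLoopSeq advances the stored index by two. The plain C++ below only
illustrates that layout; the buffer contents are made-up sample data, and the
exact bookkeeping lives in genResolvedSliceBegin, which the diff does not show
in full.

// Illustration of the slice position buffer layout assumed above.
#include <cstdio>
#include <vector>

int main() {
  // [memSize, curIdx, pLo0, pHi0, pLo1, pHi1] -- sample values only.
  std::vector<int> sPtrBuf = {6, 2, 0, 3, 5, 9};

  // Mirrors the outer loop of genUnResolvedSliceTreeTraverse: start at
  // index 2, step by 2, one [pLo, pHi] segment per iteration.
  int memSize = sPtrBuf[0];
  for (int iv = 2; iv < memSize; iv += 2)
    std::printf("segment [%d, %d)\n", sPtrBuf[iv], sPtrBuf[iv + 1]);

  // On exiting a resolved level, the emitter bumps the stored index by two
  // to skip the finished [pLo, pHi] pair (the "pIdx += 2" comment).
  sPtrBuf[1] += 2;
  std::printf("curIdx = %d\n", sPtrBuf[1]);
  return 0;
}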

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
index 5bbb68198e0f5..554f24b16f8d6 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -264,6 +264,9 @@ class LoopEmitter {
     unsigned depth; // the depth (relative to dependentDimMap[tid][lvl]).
   };
 
+  using LoopBodyBuilder = llvm::function_ref<void(OpBuilder &, Location, Value,
+                                                  MutableArrayRef<Value>)>;
+
   /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
   Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl,
                    Value iv);
@@ -318,11 +321,13 @@ class LoopEmitter {
                                             ArrayRef<TensorId> tids,
                                             ArrayRef<Level> lvls);
 
-  /// Emits a for loop to iterate over a dense level, or a sparse level that has
-  /// not been sliced.
+  /// Emits a for loop to iterate over a tensor level with the provided lower
+  /// bound `lo` and upper bound `hi`.
+  /// Apart from iterating over just a single tensor level, the for loop can
+  /// also be used for slice-driven loops on dense levels.
   Operation *emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
-                                        TensorId tid, Level lvl,
-                                        MutableArrayRef<Value> reduc,
+                                        TensorId tid, Level lvl, Value lo,
+                                        Value hi, MutableArrayRef<Value> reduc,
                                         bool isParallel);
 
   /// Emits a while loop to iterate over a sparse level that has been sliced.
@@ -405,9 +410,16 @@ class LoopEmitter {
 
   /// Retrieves the most recent slice on lvl. To reduce affine expression like
   /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of
-  /// size d2). This methods returns the latter slice (of size d2), which is
-  /// also the final slice on the level.
-  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl);
+  /// size d2). This method returns the latter slice (of size d2).
+  const SliceInfo &getMostRecentSliceOnLvl(TensorId tid, Level lvl);
+
+  /// Similar to getMostRecentSliceOnLvl, but asserts when the most recent
+  /// slice is not the final slice needed to fully reduce the dependencies.
+  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl) {
+    const SliceInfo &info = getMostRecentSliceOnLvl(tid, lvl);
+    assert(info.depth == dependentLvlMap[tid][lvl].size() - 1);
+    return info;
+  }
 
   /// Get the remaining number of constraints needed to fully *resolve*
   /// dependent levels on tensor[tid].
@@ -436,18 +448,15 @@ class LoopEmitter {
   genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo,
                           Value pHi, Value offset, Value size, TensorId tid,
                           Level lvl, ValueRange userReduc, bool genYield,
-                          /*bodyBuilder=*/
-                          llvm::function_ref<void(OpBuilder &, Location, Value,
-                                                  MutableArrayRef<Value>)>);
+                          LoopBodyBuilder bodyBuilder);
 
   /// Generates a nested loop that iterates over tid on all the coordinates on
   /// lvl.
-  ValueRange genUnResolvedSliceTreeTraverse(
-      OpBuilder &builder, Location loc, Value offset, TensorId tid, Level lvl,
-      size_t depth, ValueRange userReduc,
-      /*bodyBody=*/
-      llvm::function_ref<void(OpBuilder &, Location, Value,
-                              MutableArrayRef<Value>)>);
+  ValueRange
+  genUnResolvedSliceTreeTraverse(OpBuilder &builder, Location loc, TensorId tid,
+                                 ArrayRef<const SliceInfo *> unResLvls,
+                                 ValueRange userReduc,
+                                 LoopBodyBuilder bodyBuilder);
 
   /// Generates code to get the first non-empty slice of tid on lvl, when all
   /// the previous level before `lvl` are resolved (or lvl is the first level).
@@ -465,6 +474,11 @@ class LoopEmitter {
   void genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl);
 
+  /// Invalidates the index kept in slice position buffers (by setting it to
+  /// zero).
+  /// TODO: We should instead use an SSA value for the index.
+  void invalidateSliceIterIdx(OpBuilder &builder, Location loc, TensorId tid,
+                              Level lvl);
   /// Generates code to get the first non-empty slice of tid on lvl.
   /// return true if has already been resolved.
   bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl);
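
The LoopBodyBuilder alias introduced above just names the callback signature
that was previously spelled out as llvm::function_ref at each call site. As a
rough, self-contained illustration of that callback style (std::function stands
in for llvm::function_ref; none of the names below come from the patch):

// Hypothetical sketch of a callback-driven loop "emitter" in plain C++.
#include <cstdio>
#include <functional>
#include <vector>

using LoopBodyBuilder =
    std::function<void(int /*iv*/, std::vector<int> & /*reduc*/)>;

// Runs a loop from lo to hi and lets the caller supply the body, the way
// genSliceLvlTraverseLoop threads user reduction values through bodyBuilder.
static void emitLoop(int lo, int hi, std::vector<int> &reduc,
                     const LoopBodyBuilder &bodyBuilder) {
  for (int iv = lo; iv < hi; ++iv)
    bodyBuilder(iv, reduc);
}

int main() {
  std::vector<int> reduc = {0}; // one reduction value
  emitLoop(0, 4, reduc, [](int iv, std::vector<int> &r) { r[0] += iv; });
  std::printf("sum = %d\n", reduc[0]); // prints: sum = 6
  return 0;
}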

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 40a2454f779de..3a90ca513cc45 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -1484,11 +1484,12 @@ static bool startLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp,
                                            std::optional<Level> lvl,
                                            DimLevelType dlt, bool isIdxReduc) {
     assert(env.merger().loop(b) == idx);
-    // FIXME: Dense index reduction can reuse the universal index as well.
-    if (!isIdxReduc && (isDenseDLT(dlt) || isUndefDLT(dlt))) {
+    if (isDenseDLT(dlt) || isUndefDLT(dlt))
       needsUniv = true;
-    } else {
-      // sparse/singleton levels.
+    if (isCompressedDLT(dlt) || isSingletonDLT(dlt) || isIdxReduc) {
+      // Only when this is an index reduction loop can the dlt be undefined.
+      assert(!isUndefDLT(dlt) || isIdxReduc);
+      // sparse/singleton levels, or a dense/sparse index reduction loop.
       tids.push_back(tid);
       lvls.push_back(*lvl);
     }
@@ -1581,7 +1582,7 @@ static bool translateBitsToTidLvlPairs(
           tids.push_back(tid);
           lvls.push_back(*lvl);
           numloopCond++;
-        } else if (isDenseDLT(dlt)) {
+        } else if (isDenseDLT(dlt) || isIdxReduc) {
           tids.push_back(tid);
           lvls.push_back(*lvl);
         } else {

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index 1ca6b81285e25..555d1bb232035 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -1,9 +1,4 @@
-// UNSUPPORTED: target={{.*}}
-// FIXME: The test case is disabled (for now) because affine index on sparse tensor
-// are not handled efficiently by sparse compiler, the test case will be re-enabled
-// after new algorithm is implemented.
-
-// DEFINE: %{option} = enable-runtime-library=true
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
 // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:  -e entry -entry-point-result=void  \
@@ -13,16 +8,16 @@
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation and vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
 // RUN: %{compile} | %{run}
 
 // Do the same run, but now with direct IR generation and, if available, VLA
 // vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-index-reduction=true enable-arm-sve=%ENABLE_VLA"
 // REDEFINE: %{run} = %lli \
 // REDEFINE:   --entry-function=entry_lli \
 // REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
@@ -33,6 +28,7 @@
 
 #DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
 #CSR = #sparse_tensor.encoding<{dimLevelType = ["dense", "compressed"]}>
+#CDR = #sparse_tensor.encoding<{dimLevelType = ["compressed", "dense"]}>
 #CSC = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "compressed" ],
   dimOrdering = affine_map<(i,j) -> (j,i)>
@@ -42,46 +38,55 @@
 module {
 
   func.func @conv2d(%input:  tensor<8x8xi32>,
-               %filter: tensor<3x3xi32, #DCSR>,
+               %filter: tensor<3x3xi32>,
                %output: tensor<6x6xi32>) -> tensor<6x6xi32> {
     %0 = linalg.conv_2d
-      ins  (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
+      ins  (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32>)
       outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32>
     return %0 : tensor<6x6xi32>
   }
 
   func.func @conv2d_sparse_out(%input:  tensor<8x8xi32>,
-               %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> {
+               %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> {
     %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
     %0 = linalg.conv_2d
-      ins  (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
+      ins  (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32>)
       outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
     return %0 : tensor<6x6xi32, #DCSR>
   }
 
   func.func @conv2d_all_sparse_DCSR(%input:  tensor<8x8xi32, #DCSR>,
-               %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> {
+               %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> {
     %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
     %0 = linalg.conv_2d
-      ins  (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>)
+      ins  (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32>)
       outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
     return %0 : tensor<6x6xi32, #DCSR>
   }
 
   func.func @conv2d_all_sparse_CSR(%input:  tensor<8x8xi32, #CSR>,
-               %filter: tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR> {
+               %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CSR> {
     %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR>
     %0 = linalg.conv_2d
-      ins  (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>)
+      ins  (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32>)
       outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR>
     return %0 : tensor<6x6xi32, #CSR>
   }
 
+  func.func @conv2d_all_sparse_CD(%input:  tensor<8x8xi32, #CDR>,
+               %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CDR> {
+    %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CDR>
+    %0 = linalg.conv_2d
+      ins  (%input, %filter: tensor<8x8xi32, #CDR>, tensor<3x3xi32>)
+      outs (%s: tensor<6x6xi32, #CDR>) -> tensor<6x6xi32, #CDR>
+    return %0 : tensor<6x6xi32, #CDR>
+  }
+
   func.func @conv2d_all_sparse_CSC(%input:  tensor<8x8xi32, #CSC>,
-               %filter: tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC> {
+               %filter: tensor<3x3xi32>) -> tensor<6x6xi32, #CSC> {
     %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC>
     %0 = linalg.conv_2d
-      ins  (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>)
+      ins  (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32>)
       outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC>
     return %0 : tensor<6x6xi32, #CSC>
   }
@@ -96,12 +101,6 @@ module {
       [  0,  0,  0 ],
       [ -1,  0,  1 ]
     ]> : tensor<3x3xi32>
-    %sparse_filter_DCSR = sparse_tensor.convert %filter
-      : tensor<3x3xi32> to tensor<3x3xi32, #DCSR>
-    %sparse_filter_CSR = sparse_tensor.convert %filter
-      : tensor<3x3xi32> to tensor<3x3xi32, #CSR>
-    %sparse_filter_CSC = sparse_tensor.convert %filter
-      : tensor<3x3xi32> to tensor<3x3xi32, #CSC>
 
 
     %input = arith.constant dense<[
@@ -118,26 +117,31 @@ module {
       : tensor<8x8xi32> to tensor<8x8xi32, #DCSR>
     %sparse_input_CSR = sparse_tensor.convert %input
       : tensor<8x8xi32> to tensor<8x8xi32, #CSR>
+    %sparse_input_CD = sparse_tensor.convert %input
+      : tensor<8x8xi32> to tensor<8x8xi32, #CDR>
     %sparse_input_CSC = sparse_tensor.convert %input
       : tensor<8x8xi32> to tensor<8x8xi32, #CSC>
 
     // Call the kernel.
     %output = arith.constant dense<0> : tensor<6x6xi32>
-    %0 = call @conv2d(%input, %sparse_filter_DCSR, %output)
+    %0 = call @conv2d(%input, %filter, %output)
        : (tensor<8x8xi32>,
-          tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32>
-    %1 = call @conv2d_sparse_out(%input, %sparse_filter_DCSR)
+          tensor<3x3xi32>, tensor<6x6xi32>) -> tensor<6x6xi32>
+    %1 = call @conv2d_sparse_out(%input, %filter)
        : (tensor<8x8xi32>,
-          tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
-    %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %sparse_filter_DCSR)
+          tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR>
+    %2 = call @conv2d_all_sparse_DCSR(%sparse_input_DCSR, %filter)
        : (tensor<8x8xi32, #DCSR>,
-          tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
-    %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %sparse_filter_CSR)
+          tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR>
+    %3 = call @conv2d_all_sparse_CSR(%sparse_input_CSR, %filter)
        : (tensor<8x8xi32, #CSR>,
-          tensor<3x3xi32, #CSR>) -> tensor<6x6xi32, #CSR>
-    %4 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %sparse_filter_CSC)
+          tensor<3x3xi32>) -> tensor<6x6xi32, #CSR>
+    %4 = call @conv2d_all_sparse_CD(%sparse_input_CD, %filter)
+       : (tensor<8x8xi32, #CDR>,
+          tensor<3x3xi32>) -> tensor<6x6xi32, #CDR>
+    %5 = call @conv2d_all_sparse_CSC(%sparse_input_CSC, %filter)
        : (tensor<8x8xi32, #CSC>,
-          tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC>
+          tensor<3x3xi32>) -> tensor<6x6xi32, #CSC>
 
 
     // Verify the output.
@@ -183,6 +187,21 @@ module {
       : tensor<6x6xi32>, vector<6x6xi32>
     vector.print %v2 : vector<6x6xi32>
 
+    //
+    // Should be the same as dense output
+    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %all_sparse_CD = sparse_tensor.convert %4
+      : tensor<6x6xi32, #CDR> to tensor<6x6xi32>
+    %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v4 : vector<6x6xi32>
+
     //
     // Should be the same as dense output
     // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
@@ -207,25 +226,23 @@ module {
     // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
     // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
     //
-    %all_sparse_CSC = sparse_tensor.convert %4
+    %all_sparse_CSC = sparse_tensor.convert %5
       : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
-    %v4 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
+    %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
       : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v4 : vector<6x6xi32>
+    vector.print %v5 : vector<6x6xi32>
 
     // Release the resources.
-    bufferization.dealloc_tensor %sparse_filter_DCSR : tensor<3x3xi32, #DCSR>
-    bufferization.dealloc_tensor %sparse_filter_CSR : tensor<3x3xi32, #CSR>
-    bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC>
-
     bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR>
     bufferization.dealloc_tensor %sparse_input_CSR : tensor<8x8xi32, #CSR>
     bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC>
+    bufferization.dealloc_tensor %sparse_input_CD : tensor<8x8xi32, #CDR>
 
     bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
-    bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CSC>
+    bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR>
+    bufferization.dealloc_tensor %5 : tensor<6x6xi32, #CSC>
     return
   }
 }

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
index 3c7d89f26401f..f9602ab93d259 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
@@ -1,9 +1,4 @@
-// UNSUPPORTED: target={{.*}}
-// FIXME: The test case is disabled (for now) because affine index on sparse tensor
-// are not handled efficiently by sparse compiler, the test case will be re-enabled
-// after new algorithm is implemented.
-
-// DEFINE: %{option} = enable-runtime-library=true
+// DEFINE: %{option} = "enable-index-reduction=true enable-runtime-library=true"
 // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:  -e entry -entry-point-result=void  \
@@ -13,16 +8,16 @@
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation and vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 
 // Do the same run, but now with direct IR generation and, if available, VLA
 // vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
 // REDEFINE: %{run} = %lli \
 // REDEFINE:   --entry-function=entry_lli \
 // REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
@@ -39,6 +34,10 @@
   dimLevelType = [ "compressed", "dense", "compressed" ]
 }>
 
+#DDC = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed", "compressed" ]
+}>
+
 // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f
 func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor<?x?x?xf32> {
   %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor<?x?x?xf32>
@@ -53,24 +52,33 @@ func.func @conv_3d(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>, %arg2: te
   return %ret : tensor<?x?x?xf32>
 }
 
-func.func @conv_3d_CCC(%arg0: tensor<?x?x?xf32, #CCC>, %arg1: tensor<?x?x?xf32, #CCC>) -> tensor<?x?x?xf32, #CCC> {
+func.func @conv_3d_CCC(%arg0: tensor<?x?x?xf32, #CCC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #CCC> {
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor<?x?x?xf32, #CCC>
   %ret = linalg.conv_3d
-     ins (%arg0, %arg1: tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32, #CCC>)
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>)
     outs (%s: tensor<?x?x?xf32, #CCC>) -> tensor<?x?x?xf32, #CCC>
   return %ret : tensor<?x?x?xf32, #CCC>
 }
 
-func.func @conv_3d_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?x?xf32, #CDC>) -> tensor<?x?x?xf32, #CDC> {
+func.func @conv_3d_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #CDC> {
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor<?x?x?xf32, #CDC>
   %ret = linalg.conv_3d
-     ins (%arg0, %arg1: tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32, #CDC>)
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32>)
     outs (%s: tensor<?x?x?xf32, #CDC>) -> tensor<?x?x?xf32, #CDC>
   return %ret : tensor<?x?x?xf32, #CDC>
 }
 
+func.func @conv_3d_DDC(%arg0: tensor<?x?x?xf32, #DDC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #DDC> {
+  %c6 = arith.constant 6 : index
+  %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor<?x?x?xf32, #DDC>
+  %ret = linalg.conv_3d
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #DDC>, tensor<?x?x?xf32>)
+    outs (%s: tensor<?x?x?xf32, #DDC>) -> tensor<?x?x?xf32, #DDC>
+  return %ret : tensor<?x?x?xf32, #DDC>
+}
+
 func.func @entry() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -88,17 +96,15 @@ func.func @entry() {
 
   %in3D_CCC = sparse_tensor.convert %in3D
     : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CCC>
-  %filter3D_CCC = sparse_tensor.convert %filter3D
-    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CCC>
-
   %in3D_CDC = sparse_tensor.convert %in3D
     : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CDC>
-  %filter3D_CDC = sparse_tensor.convert %filter3D
-    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CDC>
+  %in3D_DDC = sparse_tensor.convert %in3D
+    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #DDC>
 
   %dense_ret = call @conv_3d(%in3D, %filter3D, %out3D) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>)
-  %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D_CCC) : (tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32, #CCC>) -> (tensor<?x?x?xf32, #CCC>)
-  %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D_CDC) : (tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32, #CDC>) -> (tensor<?x?x?xf32, #CDC>)
+  %CCC_ret = call @conv_3d_CCC(%in3D_CCC, %filter3D) : (tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #CCC>)
+  %CDC_ret = call @conv_3d_CDC(%in3D_CDC, %filter3D) : (tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #CDC>)
+  %DDC_ret = call @conv_3d_DDC(%in3D_DDC, %filter3D) : (tensor<?x?x?xf32, #DDC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #DDC>)
 
   //      CHECK:( ( ( 108, 108, 108, 108, 108, 108 ),
   // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
@@ -224,18 +230,59 @@ func.func @entry() {
       : tensor<?x?x?xf32>, vector<6x6x6xf32>
   vector.print %v2 : vector<6x6x6xf32>
 
+  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
+  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
+  %3 = sparse_tensor.convert %DDC_ret
+    : tensor<?x?x?xf32, #DDC> to tensor<?x?x?xf32>
+  %v3 = vector.transfer_read %3[%c0, %c0, %c0], %zero
+      : tensor<?x?x?xf32>, vector<6x6x6xf32>
+  vector.print %v3 : vector<6x6x6xf32>
+
   // Free the resources
   bufferization.dealloc_tensor %in3D : tensor<?x?x?xf32>
   bufferization.dealloc_tensor %filter3D : tensor<?x?x?xf32>
   bufferization.dealloc_tensor %out3D : tensor<?x?x?xf32>
 
   bufferization.dealloc_tensor %in3D_CDC : tensor<?x?x?xf32, #CDC>
-  bufferization.dealloc_tensor %filter3D_CDC : tensor<?x?x?xf32, #CDC>
   bufferization.dealloc_tensor %in3D_CCC : tensor<?x?x?xf32, #CCC>
-  bufferization.dealloc_tensor %filter3D_CCC : tensor<?x?x?xf32, #CCC>
+  bufferization.dealloc_tensor %in3D_DDC : tensor<?x?x?xf32, #DDC>
 
   bufferization.dealloc_tensor %CCC_ret : tensor<?x?x?xf32, #CCC>
   bufferization.dealloc_tensor %CDC_ret : tensor<?x?x?xf32, #CDC>
-
+  bufferization.dealloc_tensor %DDC_ret : tensor<?x?x?xf32, #DDC>
   return
 }


        

