[Mlir-commits] [mlir] [mlir][sparse] initialize slice-driven loop-related fields in one place (PR #76099)

Yinying Li llvmlistbot at llvm.org
Wed Dec 20 13:41:39 PST 2023


================
@@ -487,35 +492,70 @@ void LoopEmitter::initializeLoopEmit(
     // hoist the code outside if-conditions.
   }
 
-  Type indexType = builder.getIndexType();
-  Value c0 = constantZero(builder, loc, indexType);
+  initSliceDriven(builder, loc);
+}
+
+void LoopEmitter::initSliceDriven(OpBuilder &builder, Location loc) {
+  Value c0 = C_IDX(0);
   for (TensorId t = 0, e = tensors.size(); t < e; t++) {
     auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
     if (!rtp)
       continue;
 
     Level lvlRank = SparseTensorType(rtp).getLvlRank();
+
+    // Compute the dependency reduction order.
+    auto remDepStack = dependentLvlMap;
+    std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
     for (Level lvl = 0; lvl < lvlRank; lvl++) {
-      if (!dependentLvlMap[t][lvl].empty()) {
-        ArrayRef<std::pair<TensorLevel, unsigned>> depLvls =
-            dependentLvlMap[t][lvl];
-        // Needs at least two operands to form a non-trivial affine expression.
-        assert(depLvls.size() == sliceMeta[t][lvl].size());
-
-        Value size = c0;
-        for (int e = depLvls.size() - 1; e >= 0; e--) {
-          auto [dt, dl] = unpackTensorLevel(depLvls[e].first);
-          unsigned stride = depLvls[e].second;
-          Value stridedSize = lvlSizes[dt][dl];
-          if (stride != 1)
-            stridedSize = MULI(stridedSize, C_IDX(stride));
-          size = ADDI(size, stridedSize);
-          sliceMeta[t][lvl][e] = std::make_pair(size, stride);
-        }
+      // Reverse queue into a stack.
+      std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
+      for (auto [loop, coeff] : dependentLvlMap[t][lvl])
+        depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
+    }
+
+    if (depRedOrder.empty())
+      continue;
+    std::sort(depRedOrder.begin(), depRedOrder.end(),
+              [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });
+
+    for (auto [loop, t, lvl] : depRedOrder) {
+      std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
+      assert(curDep.first == loop);
+      Value size = c0;
+      for (auto [loop, stride] : remDepStack[t][lvl]) {
+        // The synthetic tensor high defines the loop upper bound.
+        Value loopHi = highs[getSynTensorId()][loop];
+        size = ADDI(size, MULI(loopHi, C_IDX(stride)));
       }
+      sliceMeta[t][lvl].emplace_back(size, curDep.second);
+      remDepStack[t][lvl].pop_back();
+
+      // Generate the caches required to quickly compute the next non-empty
+      // slice with increasing offset for slice-based loops.
+      // No cache is needed for dense levels.
+      if (!remDepStack[t][lvl].empty() && !isDenseLT(lvls[t][lvl]->getLT())) {
+        Value cnt = C_IDX(1);
+        for (int preLvl = lvl - 1; preLvl >= 0; preLvl--) {
+          if (remDepStack[t][preLvl].empty())
+            break;
+          assert(remDepStack[t][preLvl].size() == 1 && "Not implemented");
+          auto [loop, stride] = remDepStack[t][preLvl].back();
+          assert(stride == 1 && "Not yet implemented");
+          // Accumulates the size required to cache the pLo for the slice.
+          // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the
+          // second level, we need at most a memref<d0xindex>.
+          //
+          // NOTE: this is apparently an over-approximation when the previous
+          // level is compressed, and we can compute a precise memory size
+          // inside the loops. But that would also require us to allocate/free
+          // memorys in loops.
----------------
yinying-lisa-li wrote:

nit: memory

https://github.com/llvm/llvm-project/pull/76099
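
For readers skimming the patch, below is a minimal, self-contained C++ sketch of the dependency-reduction ordering and slice-size computation that initSliceDriven performs in the hunk above. It is not the MLIR code itself: plain integers stand in for mlir::Value, the tensor/loop shapes are made up, and it prints the result instead of populating sliceMeta; only the names depRedOrder and remDepStack mirror the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <tuple>
#include <utility>
#include <vector>

int main() {
  using LoopId = unsigned;
  using TensorId = unsigned;
  using Level = unsigned;

  // dependentLvlMap[t][lvl] holds (loop, stride coefficient) pairs, i.e. the
  // loops whose affine combination indexes level `lvl` of tensor `t`.
  // Here: one tensor, two levels, each depending on a single loop.
  std::vector<std::vector<std::vector<std::pair<LoopId, unsigned>>>>
      dependentLvlMap = {{{{1, 2}}, {{0, 1}}}};
  // Upper bound of each loop (the synthetic-tensor high in the patch).
  std::vector<int64_t> loopHi = {10, 20};

  const TensorId t = 0;
  auto remDepStack = dependentLvlMap; // working copy, popped as we go

  // Collect (loop, tensor, level) triples and sort them by loop id so that
  // slice sizes are resolved in dependency-reduction order.
  std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
  for (Level lvl = 0; lvl < remDepStack[t].size(); lvl++)
    for (auto &dep : remDepStack[t][lvl])
      depRedOrder.emplace_back(dep.first, t, lvl);
  std::sort(depRedOrder.begin(), depRedOrder.end(),
            [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });

  for (auto [loop, tid, lvl] : depRedOrder) {
    auto curDep = remDepStack[tid][lvl].back();
    assert(curDep.first == loop && "dependencies resolved out of order");
    // size = sum over the remaining dependencies of loopHi(loop) * stride.
    int64_t size = 0;
    for (auto [l, stride] : remDepStack[tid][lvl])
      size += loopHi[l] * int64_t(stride);
    std::cout << "tensor " << tid << " level " << lvl << ": slice size "
              << size << ", stride " << curDep.second << "\n";
    remDepStack[tid][lvl].pop_back();
  }
  return 0;
}

With the example values above this prints a slice size of 10 (stride 1) for level 1 and 40 (stride 2) for level 0, i.e. size = sum(loopHi * stride) over the remaining dependencies, matching the ADDI/MULI chain in the patch.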


More information about the Mlir-commits mailing list