[Mlir-commits] [mlir] [MLIR] [SparseTensor] Implement multiple loop ordering heuristics for sparse tensor dialect (PR #151885)
Peiming Liu
llvmlistbot at llvm.org
Fri Aug 8 11:05:41 PDT 2025
================
@@ -271,3 +349,907 @@ void IterationGraphSorter::addConstraints(Value t, AffineMap loop2LvlMap) {
}
}
}
+
+// Returns the sparse tensor encoding (storage format, level types, etc.) of
+// the given tensor, or a null attribute if it is not a ranked tensor type.
+static SparseTensorEncodingAttr getEncodingInfo(Value tensor) {
+ auto tensorType = dyn_cast<RankedTensorType>(tensor.getType());
+ if (!tensorType)
+ return nullptr; // Not a ranked tensor type
+ return getSparseTensorEncoding(tensorType);
+}
+
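+// Analyzes the memory access pattern of every loop: resets the per-loop
+// bookkeeping, classifies each tensor access through the loop-to-level maps,
+// and folds the results into a per-loop score.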
+void IterationGraphSorter::analyzeMemoryPatterns() {
+ const unsigned numLoops = getNumLoops();
+ loopMemoryAnalysis.resize(numLoops);
+
+ // Initialize memory analysis for each loop
+ for (unsigned loop = 0; loop < numLoops; ++loop) {
+ auto &memInfo = loopMemoryAnalysis[loop];
+ memInfo.totalTensorAccesses = 0;
+ memInfo.sparseAccessCost = 0;
+ memInfo.compressedSequentialAccesses.clear();
+ memInfo.randomSparseAccesses.clear();
+ memInfo.unitStrideAccesses.clear();
+ memInfo.avgStrideComplexity = 0.0;
+ memInfo.spatialLocalityScore = 0.0;
+ memInfo.temporalReuseScore = 0.0;
+ memInfo.accessPatternRand = 0.0;
+ }
+
+ // Analyze input tensors
+ for (auto [tensorIdx, tensor] : llvm::enumerate(ins)) {
+ const AffineMap &map = loop2InsLvl[tensorIdx];
+ analyzeMapForMemoryPatterns(map, tensorIdx, tensor, false);
+ }
+
+ // Analyze output tensor
+ analyzeMapForMemoryPatterns(loop2OutLvl, ins.size(), out, true);
+
+  // Fold the collected access patterns into per-loop scores (no
+  // architecture-specific assumptions such as cache sizes).
+ for (unsigned loop = 0; loop < numLoops; ++loop) {
+ computeArchitectureScore(loop);
+ }
+}
+
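+// Classifies how loop `loopIdx` accesses level `dim` of a sparse tensor,
+// based on the level type and the affine expression indexing that level.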
+IterationGraphSorter::SparseAccessPattern
+IterationGraphSorter::analyzeSparseAccessPattern(
+ AffineMap map, unsigned dim, unsigned loopIdx,
+ SparseTensorEncodingAttr encoding, unsigned tensorIdx) {
+
+ SparseAccessPattern pattern;
+
+ // Get the level types for this encoding
+ auto lvlTypes = encoding.getLvlTypes();
+ if (dim >= lvlTypes.size()) {
+    pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ return pattern;
+ }
+
+ LevelType levelType = lvlTypes[dim];
+ AffineExpr dimExpr = map.getResult(dim);
+
+ // Analyze the affine expression for this dimension
+ if (auto dimExprCast = dyn_cast<AffineDimExpr>(dimExpr)) {
+ // Simple case: dimension expression is just a loop variable
+ if (dimExprCast.getPosition() == loopIdx) {
+
+ if (isCompressedLT(levelType)) {
+ // Sequential access through compressed dimension
+ pattern.type = SparseAccessType::kCompressedSequential;
+ pattern.expectedSparsity = 1.0;
+ pattern.memoryIndirections = 1;
+ pattern.hasGoodLocality = true;
+ } else if (isSingletonLT(levelType)) {
+ // Sequential scan through singleton dimension
+ pattern.type = SparseAccessType::kSingletonScan;
+ pattern.expectedSparsity = 0.1;
+ pattern.memoryIndirections = 2;
+ pattern.hasGoodLocality = false;
+ } else {
+ // Dense level
+ pattern.type = SparseAccessType::kDenseSubtensor;
+ pattern.expectedSparsity = 1.0;
+ pattern.memoryIndirections = 1;
+ pattern.hasGoodLocality = true;
+ }
+ } else {
+ // Loop variable doesn't match this dimension
+      pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ }
+ } else {
+    // Complex affine expressions are generally poor for sparse access.
+    pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ }
+
+ return pattern;
+}
+
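+// Records, for every loop referenced by `map`, how tensor `tensorIdx` is
+// accessed: sparse tensors are bucketed by sparse access type, dense tensors
+// by stride complexity.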
+void IterationGraphSorter::analyzeMapForMemoryPatterns(AffineMap map,
+ unsigned tensorIdx,
+ Value tensor,
+ bool isOutput) {
+
+ auto encoding = getEncodingInfo(tensor);
+ bool isSparse = static_cast<bool>(encoding);
+
+ const unsigned tensorRank = map.getNumResults();
+
+ for (unsigned dim = 0; dim < tensorRank; ++dim) {
+ AffineExpr dimExpr = map.getResult(dim);
+
+ AffineDimCollector collector;
+ collector.walkPostOrder(dimExpr);
+
+ for (auto dimExprNode : collector.dims) {
+ unsigned loopIdx = dimExprNode.getPosition();
+ auto &loopInfo = loopMemoryAnalysis[loopIdx];
+ loopInfo.totalTensorAccesses++;
+
+ if (isSparse) {
+ // Sparse tensor analysis
+ SparseAccessPattern pattern =
+ analyzeSparseAccessPattern(map, dim, loopIdx, encoding, tensorIdx);
+
+ switch (pattern.type) {
+ case SparseAccessType::kCompressedSequential:
+ loopInfo.compressedSequentialAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kSingletonScan:
+ loopInfo.singletonScanAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kRandomSparse:
+ loopInfo.randomSparseAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kDenseSubtensor:
+ loopInfo.unitStrideAccesses.push_back(tensorIdx);
+ break;
+ }
+ } else {
+        // Dense tensor analysis based on stride complexity.
+        unsigned strideComplexity = computeStrideComplexity(dimExpr, loopIdx);
+ if (strideComplexity == 1) {
+ loopInfo.unitStrideAccesses.push_back(tensorIdx);
+ } else if (strideComplexity == 2) {
+ loopInfo.linearStrideAccesses.push_back(tensorIdx);
+ } else {
+ loopInfo.complexAccesses.push_back(tensorIdx);
+ }
+ }
+ }
+ }
+}
+
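+// Returns 1 when `expr` is exactly the target loop index (unit stride), 2
+// when the target loop appears once alongside at most one other loop (linear
+// stride), and 3 otherwise (complex access).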
+unsigned IterationGraphSorter::computeStrideComplexity(AffineExpr expr,
+ unsigned targetLoop) {
+ if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
+ return dimExpr.getPosition() == targetLoop ? 1 : 3;
+ }
+
+ AffineDimCollector collector;
+ collector.walkPostOrder(expr);
+
+ unsigned targetLoopCount = 0;
+ unsigned otherLoopCount = 0;
+
+ for (auto dim : collector.dims) {
+ if (dim.getPosition() == targetLoop) {
+ targetLoopCount++;
+ } else {
+ otherLoopCount++;
+ }
+ }
+
+ if (targetLoopCount == 1 && otherLoopCount == 0) {
+ return 1; // Unit stride
+ } else if (targetLoopCount == 1 && otherLoopCount <= 1) {
+ return 2; // Linear stride
+ } else {
+ return 3; // Complex
+ }
+}
+
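+// Folds the recorded access patterns of loop `loopIdx` into a single score
+// (stored in avgStrideComplexity): a weighted mix of sparse and dense access
+// quality with locality bonuses, a reduction preference, and a fan-out
+// penalty from the iteration graph.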
+void IterationGraphSorter::computeArchitectureScore(unsigned loopIdx) {
+ auto &memInfo = loopMemoryAnalysis[loopIdx];
+
+ if (memInfo.totalTensorAccesses == 0) {
+ memInfo.avgStrideComplexity = 0.0;
+ return;
+ }
+
+  // Compute the sparse access score.
+ double sparseAccessScore = 0.0;
+ unsigned totalSparseAccesses = memInfo.compressedSequentialAccesses.size() +
+ memInfo.singletonScanAccesses.size() +
+ memInfo.randomSparseAccesses.size();
+
+ if (totalSparseAccesses > 0) {
+ // Weighted scoring based on access pattern efficiency
+ double compressedRatio =
+ (double)memInfo.compressedSequentialAccesses.size() /
+ totalSparseAccesses;
+ double singletonRatio =
+ (double)memInfo.singletonScanAccesses.size() / totalSparseAccesses;
+ double randomRatio =
+ (double)memInfo.randomSparseAccesses.size() / totalSparseAccesses;
+
+ double unitStrideRatio =
+ memInfo.totalTensorAccesses > 0
+ ? (double)(memInfo.unitStrideAccesses.size() +
+ memInfo.compressedSequentialAccesses.size()) /
+ memInfo.totalTensorAccesses
+ : 0.0;
+ memInfo.spatialLocalityScore = unitStrideRatio;
+
+ // Temporal reuse: reward loops that access multiple tensors (more reuse
+ // potential)
+ memInfo.temporalReuseScore =
+ std::min(1.0, memInfo.totalTensorAccesses / 3.0);
+
+ // Scoring: compressed access = 1.0, singleton = 0.4, random = 0.1
+ sparseAccessScore =
+ compressedRatio * 1.0 + singletonRatio * 0.4 + randomRatio * 0.1;
+ }
+
+ // Compute dense access score
+ double denseAccessScore = 0.0;
+ unsigned totalDenseAccesses = memInfo.unitStrideAccesses.size() +
+ memInfo.linearStrideAccesses.size() +
+ memInfo.complexAccesses.size();
+
+ if (totalDenseAccesses > 0) {
+ double unitStrideRatio =
+ (double)memInfo.unitStrideAccesses.size() / totalDenseAccesses;
+ double linearStrideRatio =
+ (double)memInfo.linearStrideAccesses.size() / totalDenseAccesses;
+ double complexAccessRatio =
+ (double)memInfo.complexAccesses.size() / totalDenseAccesses;
+
+ denseAccessScore = unitStrideRatio * 1.0 + linearStrideRatio * 0.7 +
+ complexAccessRatio * 0.2;
+ }
+
+ // Combine sparse and dense scores
+ double totalAccesses = totalSparseAccesses + totalDenseAccesses;
+ if (totalAccesses > 0) {
+ double sparseWeight = (double)totalSparseAccesses / totalAccesses;
+ double denseWeight = (double)totalDenseAccesses / totalAccesses;
+
+ memInfo.avgStrideComplexity =
+ sparseWeight * sparseAccessScore + denseWeight * denseAccessScore;
+ } else {
+ memInfo.avgStrideComplexity = 0.0;
+ }
+
+  // Apply locality bonuses to the final combined score.
+  memInfo.avgStrideComplexity *= (1.0 + memInfo.spatialLocalityScore * 0.1);
+  memInfo.avgStrideComplexity *= (1.0 + memInfo.temporalReuseScore * 0.05);
+
+  // Slight preference for reduction loops.
+  if (iterTypes[loopIdx] == utils::IteratorType::reduction) {
+    memInfo.avgStrideComplexity *= 1.15;
+  }
+
+ // Fan-out penalty
+ unsigned fanOut = 0;
+ for (unsigned j = 0; j < getNumLoops(); ++j) {
+ if (itGraph[loopIdx][j])
+ fanOut++;
+ }
+
+ double fanOutRatio = (double)fanOut / getNumLoops();
+ if (fanOutRatio > 0.5) {
+ memInfo.avgStrideComplexity *= (1.0 - fanOutRatio * 0.2);
+ }
+}
+
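+// Combines the memory score of loop `loopIdx` with parallelism and
+// iteration-graph in/out-degree information into the final score used by the
+// memory-aware selection below.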
+double IterationGraphSorter::computePortableScore(unsigned loopIdx) {
+ const auto &memInfo = loopMemoryAnalysis[loopIdx];
+
+ double memoryScore = memInfo.avgStrideComplexity;
+
+  // Prefer loops that access compressed dimensions sequentially.
+  if (!memInfo.compressedSequentialAccesses.empty()) {
+    memoryScore *= 1.2;
+  }
+
+  // Penalize loops that mostly cause random sparse accesses.
+  if (memInfo.randomSparseAccesses.size() >
+      memInfo.compressedSequentialAccesses.size()) {
+    memoryScore *= 0.8;
+  }
+
+  // Factor in parallelism and iteration-graph degrees.
+ double parallelScore =
+ (iterTypes[loopIdx] == utils::IteratorType::parallel) ? 1.1 : 1.0;
+
+ unsigned outDegree = 0;
+ unsigned inDegree = 0;
+ for (unsigned j = 0; j < getNumLoops(); ++j) {
+ if (itGraph[loopIdx][j])
+ outDegree++;
+ if (itGraph[j][loopIdx])
+ inDegree++;
+ }
+
+ double graphScore = 1.0 / (1.0 + outDegree * 0.1) + inDegree * 0.05;
+
+ return memoryScore * parallelScore * graphScore;
+}
+
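+// Memory-aware heuristic: picks the candidate loop with the highest portable
+// score.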
+unsigned IterationGraphSorter::selectBestCandidateByMemory(
+ const std::vector<unsigned> &candidates) {
+
+  if (candidates.empty())
+    return 0;
+
+ if (candidates.size() == 1)
+ return candidates[0];
+
+ unsigned bestCandidate = candidates[0];
+ double bestScore = computePortableScore(bestCandidate);
+
+ for (unsigned i = 1; i < candidates.size(); ++i) {
+ unsigned candidate = candidates[i];
+ double score = computePortableScore(candidate);
+
+ if (score > bestScore) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Density heuristic: prefer dense levels first (or sparse levels first when
+// denseFirst is false).
+unsigned IterationGraphSorter::selectBestCandidateByDensity(
+ const std::vector<unsigned> &candidates, bool denseFirst) {
+ unsigned bestCandidate = candidates[0];
+ int bestScore = denseFirst ? -1000 : 1000; // Start with worst possible score
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+    // Count dense vs. sparse level accesses for this loop.
+    for (unsigned tensorIdx = 0; tensorIdx < ins.size(); tensorIdx++) {
+      Value tensor = ins[tensorIdx];
+      if (auto enc = getSparseTensorEncoding(tensor.getType())) {
+        AffineMap dimToLvlMap = loop2InsLvl[tensorIdx];
+        if (candidate < dimToLvlMap.getNumResults()) {
+          auto lvlExpr = dimToLvlMap.getResult(candidate);
+          if (auto dimExpr = dyn_cast<AffineDimExpr>(lvlExpr)) {
+            unsigned lvl = dimExpr.getPosition();
+            if (lvl < enc.getLvlTypes().size()) {
+              if (isDenseLT(enc.getLvlTypes()[lvl])) {
+                score += 10; // Dense levels are cheap to traverse.
+              } else {
+                score -= 5; // Sparse levels are more expensive.
+              }
+            }
+          }
+        }
+      } else {
+        score += 5; // Accesses to dense tensors are always cheap.
+      }
+    }
+
+ bool isBetter = denseFirst ? (score > bestScore) : (score < bestScore);
+ if (isBetter) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Sequential-first heuristic: prefer unit stride accesses
+unsigned IterationGraphSorter::selectBestCandidateBySequentiality(
+ const std::vector<unsigned> &candidates) {
+ unsigned bestCandidate = candidates[0];
+ int bestScore = -1000;
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+    // Approximate stride analysis: reward candidates whose loop index
+    // directly indexes the tensor level at the same position; penalize
+    // complex indexing expressions.
+ for (unsigned tensorIdx = 0; tensorIdx < ins.size(); tensorIdx++) {
+ AffineMap map = loop2InsLvl[tensorIdx];
+ if (candidate < map.getNumResults()) {
+ auto expr = map.getResult(candidate);
+ // Simple approximation: direct dimension access is better
+ if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
+ if (dimExpr.getPosition() == candidate) {
+ score += 10; // Direct access is good
+ }
+ } else {
+ score -= 5; // Complex expression is worse
+ }
+ }
+ }
+
+ if (score > bestScore) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Parallel-first heuristic: parallel loops first, then by density
+unsigned IterationGraphSorter::selectBestCandidateByParallelism(
+ const std::vector<unsigned> &candidates) {
+
+ unsigned bestCandidate = candidates[0];
+ int bestScore = -1000;
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+ // Strongly prefer parallel loops
+    if (candidate < iterTypes.size() &&
+        iterTypes[candidate] == utils::IteratorType::parallel) {
+ score += 100; // Big bonus for parallel
+ } else {
+ score -= 50; // Penalty for reduction
----------------
PeimingLiu wrote:
How did you come up with these magic numbers?
https://github.com/llvm/llvm-project/pull/151885
More information about the Mlir-commits mailing list