[Mlir-commits] [mlir] [MLIR] [SparseTensor] Implement multiple loop ordering heuristics for sparse tensor dialect (PR #151885)
Peiming Liu
llvmlistbot at llvm.org
Fri Aug 8 11:05:41 PDT 2025
================
@@ -271,3 +349,907 @@ void IterationGraphSorter::addConstraints(Value t, AffineMap loop2LvlMap) {
}
}
}
+
+// Returns the sparse tensor encoding (storage format, level types, etc.) of
+// the given tensor, or a null attribute if it is not a ranked tensor type.
+static SparseTensorEncodingAttr getEncodingInfo(Value tensor) {
+ auto tensorType = dyn_cast<RankedTensorType>(tensor.getType());
+ if (!tensorType)
+ return nullptr; // Not a ranked tensor type
+ return getSparseTensorEncoding(tensorType);
+}
+
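+// Analyzes the memory access pattern of every loop: resets the per-loop
+// bookkeeping, classifies each tensor access through the loop-to-level maps,
+// and folds the results into a per-loop score.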
+void IterationGraphSorter::analyzeMemoryPatterns() {
+ const unsigned numLoops = getNumLoops();
+ loopMemoryAnalysis.resize(numLoops);
+
+ // Initialize memory analysis for each loop
+ for (unsigned loop = 0; loop < numLoops; ++loop) {
+ auto &memInfo = loopMemoryAnalysis[loop];
+ memInfo.totalTensorAccesses = 0;
+ memInfo.sparseAccessCost = 0;
+ memInfo.compressedSequentialAccesses.clear();
+ memInfo.randomSparseAccesses.clear();
+ memInfo.unitStrideAccesses.clear();
+ memInfo.avgStrideComplexity = 0.0;
+ memInfo.spatialLocalityScore = 0.0;
+ memInfo.temporalReuseScore = 0.0;
+ memInfo.accessPatternRand = 0.0;
+ }
+
+ // Analyze input tensors
+ for (auto [tensorIdx, tensor] : llvm::enumerate(ins)) {
+ const AffineMap &map = loop2InsLvl[tensorIdx];
+ analyzeMapForMemoryPatterns(map, tensorIdx, tensor, false);
+ }
+
+ // Analyze output tensor
+ analyzeMapForMemoryPatterns(loop2OutLvl, ins.size(), out, true);
+
+  // Fold the collected access patterns into per-loop scores (no
+  // architecture-specific assumptions such as cache sizes).
+ for (unsigned loop = 0; loop < numLoops; ++loop) {
+ computeArchitectureScore(loop);
+ }
+}
+
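+// Classifies how loop `loopIdx` accesses level `dim` of a sparse tensor,
+// based on the level type and the affine expression indexing that level.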
+IterationGraphSorter::SparseAccessPattern
+IterationGraphSorter::analyzeSparseAccessPattern(
+ AffineMap map, unsigned dim, unsigned loopIdx,
+ SparseTensorEncodingAttr encoding, unsigned tensorIdx) {
+
+ SparseAccessPattern pattern;
+
+ // Get the level types for this encoding
+ auto lvlTypes = encoding.getLvlTypes();
+ if (dim >= lvlTypes.size()) {
+    pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ return pattern;
+ }
+
+ LevelType levelType = lvlTypes[dim];
+ AffineExpr dimExpr = map.getResult(dim);
+
+ // Analyze the affine expression for this dimension
+ if (auto dimExprCast = dyn_cast<AffineDimExpr>(dimExpr)) {
+ // Simple case: dimension expression is just a loop variable
+ if (dimExprCast.getPosition() == loopIdx) {
+
+ if (isCompressedLT(levelType)) {
+ // Sequential access through compressed dimension
+ pattern.type = SparseAccessType::kCompressedSequential;
+ pattern.expectedSparsity = 1.0;
+ pattern.memoryIndirections = 1;
+ pattern.hasGoodLocality = true;
+ } else if (isSingletonLT(levelType)) {
+ // Sequential scan through singleton dimension
+ pattern.type = SparseAccessType::kSingletonScan;
+ pattern.expectedSparsity = 0.1;
+ pattern.memoryIndirections = 2;
+ pattern.hasGoodLocality = false;
+ } else {
+ // Dense level
+ pattern.type = SparseAccessType::kDenseSubtensor;
+ pattern.expectedSparsity = 1.0;
+ pattern.memoryIndirections = 1;
+ pattern.hasGoodLocality = true;
+ }
+ } else {
+ // Loop variable doesn't match this dimension
+      pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ }
+ } else {
+    // Complex affine expressions are generally poor for sparse access.
+    pattern.type = SparseAccessType::kRandomSparse;
+ pattern.expectedSparsity = 0.01;
+ pattern.memoryIndirections = 3;
+ pattern.hasGoodLocality = false;
+ }
+
+ return pattern;
+}
+
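+// Records, for every loop referenced by `map`, how tensor `tensorIdx` is
+// accessed: sparse tensors are bucketed by sparse access type, dense tensors
+// by stride complexity.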
+void IterationGraphSorter::analyzeMapForMemoryPatterns(AffineMap map,
+ unsigned tensorIdx,
+ Value tensor,
+ bool isOutput) {
+
+ auto encoding = getEncodingInfo(tensor);
+ bool isSparse = static_cast<bool>(encoding);
+
+ const unsigned tensorRank = map.getNumResults();
+
+ for (unsigned dim = 0; dim < tensorRank; ++dim) {
+ AffineExpr dimExpr = map.getResult(dim);
+
+ AffineDimCollector collector;
+ collector.walkPostOrder(dimExpr);
+
+ for (auto dimExprNode : collector.dims) {
+ unsigned loopIdx = dimExprNode.getPosition();
+ auto &loopInfo = loopMemoryAnalysis[loopIdx];
+ loopInfo.totalTensorAccesses++;
+
+ if (isSparse) {
+ // Sparse tensor analysis
+ SparseAccessPattern pattern =
+ analyzeSparseAccessPattern(map, dim, loopIdx, encoding, tensorIdx);
+
+ switch (pattern.type) {
+ case SparseAccessType::kCompressedSequential:
+ loopInfo.compressedSequentialAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kSingletonScan:
+ loopInfo.singletonScanAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kRandomSparse:
+ loopInfo.randomSparseAccesses.push_back(tensorIdx);
+ break;
+ case SparseAccessType::kDenseSubtensor:
+ loopInfo.unitStrideAccesses.push_back(tensorIdx);
+ break;
+ }
+ } else {
+        // Dense tensor analysis based on stride complexity.
+        unsigned strideComplexity = computeStrideComplexity(dimExpr, loopIdx);
+ if (strideComplexity == 1) {
+ loopInfo.unitStrideAccesses.push_back(tensorIdx);
+ } else if (strideComplexity == 2) {
+ loopInfo.linearStrideAccesses.push_back(tensorIdx);
+ } else {
+ loopInfo.complexAccesses.push_back(tensorIdx);
+ }
+ }
+ }
+ }
+}
+
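+// Returns 1 when `expr` is exactly the target loop index (unit stride), 2
+// when the target loop appears once alongside at most one other loop (linear
+// stride), and 3 otherwise (complex access).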
+unsigned IterationGraphSorter::computeStrideComplexity(AffineExpr expr,
+ unsigned targetLoop) {
+ if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
+ return dimExpr.getPosition() == targetLoop ? 1 : 3;
+ }
+
+ AffineDimCollector collector;
+ collector.walkPostOrder(expr);
+
+ unsigned targetLoopCount = 0;
+ unsigned otherLoopCount = 0;
+
+ for (auto dim : collector.dims) {
+ if (dim.getPosition() == targetLoop) {
+ targetLoopCount++;
+ } else {
+ otherLoopCount++;
+ }
+ }
+
+ if (targetLoopCount == 1 && otherLoopCount == 0) {
+ return 1; // Unit stride
+ } else if (targetLoopCount == 1 && otherLoopCount <= 1) {
+ return 2; // Linear stride
+ } else {
+ return 3; // Complex
+ }
+}
+
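+// Folds the recorded access patterns of loop `loopIdx` into a single score
+// (stored in avgStrideComplexity): a weighted mix of sparse and dense access
+// quality with locality bonuses, a reduction preference, and a fan-out
+// penalty from the iteration graph.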
+void IterationGraphSorter::computeArchitectureScore(unsigned loopIdx) {
+ auto &memInfo = loopMemoryAnalysis[loopIdx];
+
+ if (memInfo.totalTensorAccesses == 0) {
+ memInfo.avgStrideComplexity = 0.0;
+ return;
+ }
+
+  // Compute the sparse access score.
+ double sparseAccessScore = 0.0;
+ unsigned totalSparseAccesses = memInfo.compressedSequentialAccesses.size() +
+ memInfo.singletonScanAccesses.size() +
+ memInfo.randomSparseAccesses.size();
+
+ if (totalSparseAccesses > 0) {
+ // Weighted scoring based on access pattern efficiency
+ double compressedRatio =
+ (double)memInfo.compressedSequentialAccesses.size() /
+ totalSparseAccesses;
+ double singletonRatio =
+ (double)memInfo.singletonScanAccesses.size() / totalSparseAccesses;
+ double randomRatio =
+ (double)memInfo.randomSparseAccesses.size() / totalSparseAccesses;
+
+ double unitStrideRatio =
+ memInfo.totalTensorAccesses > 0
+ ? (double)(memInfo.unitStrideAccesses.size() +
+ memInfo.compressedSequentialAccesses.size()) /
+ memInfo.totalTensorAccesses
+ : 0.0;
+ memInfo.spatialLocalityScore = unitStrideRatio;
+
+ // Temporal reuse: reward loops that access multiple tensors (more reuse
+ // potential)
+ memInfo.temporalReuseScore =
+ std::min(1.0, memInfo.totalTensorAccesses / 3.0);
+
+ // Scoring: compressed access = 1.0, singleton = 0.4, random = 0.1
+ sparseAccessScore =
+ compressedRatio * 1.0 + singletonRatio * 0.4 + randomRatio * 0.1;
+ }
+
+ // Compute dense access score
+ double denseAccessScore = 0.0;
+ unsigned totalDenseAccesses = memInfo.unitStrideAccesses.size() +
+ memInfo.linearStrideAccesses.size() +
+ memInfo.complexAccesses.size();
+
+ if (totalDenseAccesses > 0) {
+ double unitStrideRatio =
+ (double)memInfo.unitStrideAccesses.size() / totalDenseAccesses;
+ double linearStrideRatio =
+ (double)memInfo.linearStrideAccesses.size() / totalDenseAccesses;
+ double complexAccessRatio =
+ (double)memInfo.complexAccesses.size() / totalDenseAccesses;
+
+ denseAccessScore = unitStrideRatio * 1.0 + linearStrideRatio * 0.7 +
+ complexAccessRatio * 0.2;
+ }
+
+ // Combine sparse and dense scores
+ double totalAccesses = totalSparseAccesses + totalDenseAccesses;
+ if (totalAccesses > 0) {
+ double sparseWeight = (double)totalSparseAccesses / totalAccesses;
+ double denseWeight = (double)totalDenseAccesses / totalAccesses;
+
+ memInfo.avgStrideComplexity =
+ sparseWeight * sparseAccessScore + denseWeight * denseAccessScore;
+ } else {
+ memInfo.avgStrideComplexity = 0.0;
+ }
+
+  // Apply locality bonuses to the final combined score.
+  memInfo.avgStrideComplexity *= (1.0 + memInfo.spatialLocalityScore * 0.1);
+  memInfo.avgStrideComplexity *= (1.0 + memInfo.temporalReuseScore * 0.05);
+
+  // Slight preference for reduction loops.
+  if (iterTypes[loopIdx] == utils::IteratorType::reduction) {
+    memInfo.avgStrideComplexity *= 1.15;
+  }
+
+ // Fan-out penalty
+ unsigned fanOut = 0;
+ for (unsigned j = 0; j < getNumLoops(); ++j) {
+ if (itGraph[loopIdx][j])
+ fanOut++;
+ }
+
+ double fanOutRatio = (double)fanOut / getNumLoops();
+ if (fanOutRatio > 0.5) {
+ memInfo.avgStrideComplexity *= (1.0 - fanOutRatio * 0.2);
+ }
+}
+
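+// Combines the memory score of loop `loopIdx` with parallelism and
+// iteration-graph in/out-degree information into the final score used by the
+// memory-aware selection below.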
+double IterationGraphSorter::computePortableScore(unsigned loopIdx) {
+ const auto &memInfo = loopMemoryAnalysis[loopIdx];
+
+ double memoryScore = memInfo.avgStrideComplexity;
+
+  // Prefer loops that access compressed dimensions sequentially.
+  if (!memInfo.compressedSequentialAccesses.empty()) {
+    memoryScore *= 1.2;
+  }
+
+  // Penalize loops that mostly cause random sparse accesses.
+  if (memInfo.randomSparseAccesses.size() >
+      memInfo.compressedSequentialAccesses.size()) {
+    memoryScore *= 0.8;
+  }
+
+  // Factor in parallelism and iteration-graph degrees.
+ double parallelScore =
+ (iterTypes[loopIdx] == utils::IteratorType::parallel) ? 1.1 : 1.0;
+
+ unsigned outDegree = 0;
+ unsigned inDegree = 0;
+ for (unsigned j = 0; j < getNumLoops(); ++j) {
+ if (itGraph[loopIdx][j])
+ outDegree++;
+ if (itGraph[j][loopIdx])
+ inDegree++;
+ }
+
+ double graphScore = 1.0 / (1.0 + outDegree * 0.1) + inDegree * 0.05;
+
+ return memoryScore * parallelScore * graphScore;
+}
+
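+// Memory-aware heuristic: picks the candidate loop with the highest portable
+// score.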
+unsigned IterationGraphSorter::selectBestCandidateByMemory(
+ const std::vector<unsigned> &candidates) {
+
+  if (candidates.empty())
+    return 0;
+
+ if (candidates.size() == 1)
+ return candidates[0];
+
+ unsigned bestCandidate = candidates[0];
+ double bestScore = computePortableScore(bestCandidate);
+
+ for (unsigned i = 1; i < candidates.size(); ++i) {
+ unsigned candidate = candidates[i];
+ double score = computePortableScore(candidate);
+
+ if (score > bestScore) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Density heuristic: prefer dense levels first (or sparse levels first when
+// denseFirst is false).
+unsigned IterationGraphSorter::selectBestCandidateByDensity(
+ const std::vector<unsigned> &candidates, bool denseFirst) {
+ unsigned bestCandidate = candidates[0];
+ int bestScore = denseFirst ? -1000 : 1000; // Start with worst possible score
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+    // Count dense vs. sparse level accesses for this loop.
+    for (unsigned tensorIdx = 0; tensorIdx < ins.size(); tensorIdx++) {
+      Value tensor = ins[tensorIdx];
+      if (auto enc = getSparseTensorEncoding(tensor.getType())) {
+        AffineMap dimToLvlMap = loop2InsLvl[tensorIdx];
+        if (candidate < dimToLvlMap.getNumResults()) {
+          auto lvlExpr = dimToLvlMap.getResult(candidate);
+          if (auto dimExpr = dyn_cast<AffineDimExpr>(lvlExpr)) {
+            unsigned lvl = dimExpr.getPosition();
+            if (lvl < enc.getLvlTypes().size()) {
+              if (isDenseLT(enc.getLvlTypes()[lvl])) {
+                score += 10; // Dense levels are cheap to traverse.
+              } else {
+                score -= 5; // Sparse levels are more expensive.
+              }
+            }
+          }
+        }
+      } else {
+        score += 5; // Accesses to dense tensors are always cheap.
+      }
+    }
+
+ bool isBetter = denseFirst ? (score > bestScore) : (score < bestScore);
+ if (isBetter) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Sequential-first heuristic: prefer unit stride accesses
+unsigned IterationGraphSorter::selectBestCandidateBySequentiality(
+ const std::vector<unsigned> &candidates) {
+ unsigned bestCandidate = candidates[0];
+ int bestScore = -1000;
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+    // Approximate stride analysis: reward candidates whose loop index
+    // directly indexes the tensor level at the same position; penalize
+    // complex indexing expressions.
+ for (unsigned tensorIdx = 0; tensorIdx < ins.size(); tensorIdx++) {
+ AffineMap map = loop2InsLvl[tensorIdx];
+ if (candidate < map.getNumResults()) {
+ auto expr = map.getResult(candidate);
+ // Simple approximation: direct dimension access is better
+ if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
+ if (dimExpr.getPosition() == candidate) {
+ score += 10; // Direct access is good
+ }
+ } else {
+ score -= 5; // Complex expression is worse
+ }
+ }
+ }
+
+ if (score > bestScore) {
+      bestScore = score;
+      bestCandidate = candidate;
+ }
+ }
+
+ return bestCandidate;
+}
+
+// Parallel-first heuristic: parallel loops first, then by density
+unsigned IterationGraphSorter::selectBestCandidateByParallelism(
+ const std::vector<unsigned> &candidates) {
+
+ unsigned bestCandidate = candidates[0];
+ int bestScore = -1000;
+
+ for (unsigned candidate : candidates) {
+ int score = 0;
+
+ // Strongly prefer parallel loops
+    if (candidate < iterTypes.size() &&
+        iterTypes[candidate] == utils::IteratorType::parallel) {
+ score += 100; // Big bonus for parallel
+ } else {
+ score -= 50; // Penalty for reduction
----------------
PeimingLiu wrote:
How did you come up with these magic numbers?
https://github.com/llvm/llvm-project/pull/151885
More information about the Mlir-commits mailing list