[Mlir-commits] [mlir] ccd047c - [mlir][sparse] optimize COO index handling

Aart Bik llvmlistbot at llvm.org
Wed Apr 27 10:20:59 PDT 2022


Author: Aart Bik
Date: 2022-04-27T10:20:47-07:00
New Revision: ccd047cba4f15cd95e8e3895f823757c5988b192

URL: https://github.com/llvm/llvm-project/commit/ccd047cba4f15cd95e8e3895f823757c5988b192
DIFF: https://github.com/llvm/llvm-project/commit/ccd047cba4f15cd95e8e3895f823757c5988b192.diff

LOG: [mlir][sparse] optimize COO index handling

By using a shared index pool, we reduce the footprint of each "Element"
in the COO scheme and, in addition, reduce the overhead of allocating
indices (trading many small per-element vector allocations for growth of
a single shared vector). When the capacity is known up front, *all*
allocation can be done in advance.
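
A minimal, self-contained sketch of the shared-pool idea (simplified, with
illustrative names; the actual change is in the diff below):

```
// Sketch only: elements store pointers into a single shared index pool
// instead of each owning its own std::vector of indices.
#include <cassert>
#include <cstdint>
#include <vector>

struct PoolElement {
  uint64_t *indices; // points into the shared pool below
  double value;
};

struct PoolCOO {
  PoolCOO(uint64_t rank, uint64_t capacity) : rank(rank) {
    elements.reserve(capacity);
    pool.reserve(capacity * rank); // one reservation covers all indices
  }

  void add(const std::vector<uint64_t> &ind, double val) {
    assert(ind.size() == rank);
    uint64_t *oldBase = pool.data();
    uint64_t offset = pool.size();
    pool.insert(pool.end(), ind.begin(), ind.end());
    // If the pool reallocated, rebase all previously stored pointers.
    if (pool.data() != oldBase)
      for (PoolElement &e : elements)
        e.indices = pool.data() + (e.indices - oldBase);
    elements.push_back({pool.data() + offset, val});
  }

  const uint64_t rank;
  std::vector<PoolElement> elements; // (pointer, value) pairs
  std::vector<uint64_t> pool;        // shared index pool
};
```

With the capacity known up front, the single pool.reserve() call replaces one
heap allocation per element, and the rebase loop only runs on the (amortized
rare) occasions where the pool outgrows its reservation.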

This is a big win. For example, reading the matrix SK-2005, with dimensions
50,636,154 x 50,636,154 and 1,949,412,601 nonzero elements, improves as
follows (times in ms), for an overall speedup of about 3.5x:

```
SK-2005         before         after   speedup
-----------------------------------------------
read        305,086.65    180,318.12      1.69
sort      2,836,096.23    510,492.87      5.56
pack        364,485.67    312,009.96      1.17
-----------------------------------------------
TOTAL     3,505,668.56  1,002,820.95      3.50
```

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D124502

Added: 
    

Modified: 
    mlir/lib/ExecutionEngine/SparseTensorUtils.cpp

Removed: 
    


################################################################################
diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
index e904bad64c370..7c59ccc0a739d 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -84,22 +84,14 @@ static inline uint64_t checkedMul(uint64_t lhs, uint64_t rhs) {
 ///   ({i}, a[i])
 /// and a rank-5 tensor element like
 ///   ({i,j,k,l,m}, a[i,j,k,l,m])
+/// We use a pointer into a shared index pool rather than, e.g., a direct
+/// vector, since that (1) reduces the per-element memory footprint, and
+/// (2) centralizes the memory reservation and (re)allocation in one place.
 template <typename V>
 struct Element {
-  Element(const std::vector<uint64_t> &ind, V val) : indices(ind), value(val){};
-  std::vector<uint64_t> indices;
+  Element(uint64_t *ind, V val) : indices(ind), value(val){};
+  uint64_t *indices; // pointer into shared index pool
   V value;
-  /// Returns true if indices of e1 < indices of e2.
-  static bool lexOrder(const Element<V> &e1, const Element<V> &e2) {
-    uint64_t rank = e1.indices.size();
-    assert(rank == e2.indices.size());
-    for (uint64_t r = 0; r < rank; r++) {
-      if (e1.indices[r] == e2.indices[r])
-        continue;
-      return e1.indices[r] < e2.indices[r];
-    }
-    return false;
-  }
 };
 
 /// A memory-resident sparse tensor in coordinate scheme (collection of
@@ -112,29 +104,61 @@ struct SparseTensorCOO {
 public:
   SparseTensorCOO(const std::vector<uint64_t> &szs, uint64_t capacity)
       : sizes(szs) {
-    if (capacity)
+    if (capacity) {
       elements.reserve(capacity);
+      indices.reserve(capacity * getRank());
+    }
   }
+
   /// Adds element as indices and value.
   void add(const std::vector<uint64_t> &ind, V val) {
     assert(!iteratorLocked && "Attempt to add() after startIterator()");
+    uint64_t *base = indices.data();
+    uint64_t size = indices.size();
     uint64_t rank = getRank();
     assert(rank == ind.size());
-    for (uint64_t r = 0; r < rank; r++)
+    for (uint64_t r = 0; r < rank; r++) {
       assert(ind[r] < sizes[r]); // within bounds
-    elements.emplace_back(ind, val);
+      indices.push_back(ind[r]);
+    }
+    // This base only changes if indices were reallocated. In that case, we
+    // need to correct all previous pointers into the vector. Note that this
+    // only happens if we did not set the initial capacity right, and then only
+    // for every internal vector reallocation (which with the doubling rule
+    // should only incur an amortized linear overhead).
+    uint64_t *new_base = indices.data();
+    if (new_base != base) {
+      for (uint64_t i = 0, n = elements.size(); i < n; i++)
+        elements[i].indices = new_base + (elements[i].indices - base);
+      base = new_base;
+    }
+    // Add element as (pointer into shared index pool, value) pair.
+    elements.emplace_back(base + size, val);
   }
+
   /// Sorts elements lexicographically by index.
   void sort() {
     assert(!iteratorLocked && "Attempt to sort() after startIterator()");
     // TODO: we may want to cache an `isSorted` bit, to avoid
     // unnecessary/redundant sorting.
-    std::sort(elements.begin(), elements.end(), Element<V>::lexOrder);
+    std::sort(elements.begin(), elements.end(),
+              [this](const Element<V> &e1, const Element<V> &e2) {
+                uint64_t rank = getRank();
+                for (uint64_t r = 0; r < rank; r++) {
+                  if (e1.indices[r] == e2.indices[r])
+                    continue;
+                  return e1.indices[r] < e2.indices[r];
+                }
+                return false;
+              });
   }
+
   /// Returns rank.
   uint64_t getRank() const { return sizes.size(); }
+
   /// Getter for sizes array.
   const std::vector<uint64_t> &getSizes() const { return sizes; }
+
   /// Getter for elements array.
   const std::vector<Element<V>> &getElements() const { return elements; }
 
@@ -143,6 +167,7 @@ struct SparseTensorCOO {
     iteratorLocked = true;
     iteratorPos = 0;
   }
+
   /// Get the next element.
   const Element<V> *getNext() {
     assert(iteratorLocked && "Attempt to getNext() before startIterator()");
@@ -172,7 +197,8 @@ struct SparseTensorCOO {
 
 private:
   const std::vector<uint64_t> sizes; // per-dimension sizes
-  std::vector<Element<V>> elements;
+  std::vector<Element<V>> elements;  // all COO elements
+  std::vector<uint64_t> indices;     // shared index pool
   bool iteratorLocked = false;
   unsigned iteratorPos = 0;
 };


        

