[Mlir-commits] [mlir] [mlir][sparse] support 2:4 structured sparsity and loose compressed (PR #69968)
Aart Bik
llvmlistbot at llvm.org
Mon Oct 23 13:46:45 PDT 2023
https://github.com/aartbik created https://github.com/llvm/llvm-project/pull/69968
This adds library support for these two new level formats.
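For readers unfamiliar with the two layouts, below is a minimal, standalone C++ sketch (not the MLIR runtime API; all names are made up for this illustration) of how the 3x8 matrix used by the new sparse_ds.mlir test is encoded. Loose compressed stores a [lo, hi) position pair per parent entry instead of a single shared boundary, and 2:4 structured sparsity keeps, for each block of four columns, only the two nonzero values plus their in-block offsets. The arrays it computes match the CHECK lines of the test.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // The 3x8 matrix from mlir/test/Integration/data/ds.mtx.
  const double A[3][8] = {{0, 0, 1, 2, 0, 3, 0, 4},
                          {0, 5, 6, 0, 7, 0, 0, 8},
                          {9, 0, 10, 0, 11, 12, 0, 0}};

  // Loose compressed: a [lo, hi) position pair per row, so segments may
  // leave slack between them (none here, hence hi(i) == lo(i+1)).
  std::vector<uint64_t> loosePos, looseCrd;
  for (uint64_t i = 0; i < 3; i++) {
    loosePos.push_back(looseCrd.size()); // lo of row i
    for (uint64_t j = 0; j < 8; j++)
      if (A[i][j] != 0)
        looseCrd.push_back(j);
    loosePos.push_back(looseCrd.size()); // hi of row i
  }
  // loosePos is now {0, 4, 4, 8, 8, 12}, as checked in sparse_ds.mlir.

  // 2:4 structured sparsity: every block of 4 columns holds exactly 2
  // nonzeros; only the in-block offsets (0..3) and the values are stored.
  std::vector<uint8_t> nvCrd;
  std::vector<double> nvVal;
  for (uint64_t i = 0; i < 3; i++)
    for (uint64_t j = 0; j < 8; j++)
      if (A[i][j] != 0) {
        nvCrd.push_back(static_cast<uint8_t>(j % 4)); // offset within block
        nvVal.push_back(A[i][j]);
      }
  // nvCrd is now {2, 3, 1, 3, 1, 2, 0, 3, 0, 2, 0, 1}; nvVal is 1.0..12.0.

  for (uint64_t p : loosePos)
    std::cout << p << " ";
  std::cout << "\n";
  for (unsigned c : nvCrd)
    std::cout << c << " ";
  std::cout << "\n";
  return 0;
}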
From 0227052c68bff354d1d53a0b6ea7a57636d24ba9 Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik at google.com>
Date: Mon, 23 Oct 2023 13:15:56 -0700
Subject: [PATCH] [mlir][sparse] support 2:4 structured sparsity and loose
compressed
This adds library support for these two new level formats.
---
.../mlir/Dialect/SparseTensor/IR/Enums.h | 8 +-
.../ExecutionEngine/SparseTensor/Storage.h | 115 +++++++++--------
.../ExecutionEngine/SparseTensor/Storage.cpp | 7 +-
.../Dialect/SparseTensor/CPU/sparse_ds.mlir | 120 ++++++++++++++++++
mlir/test/Integration/data/ds.mtx | 14 ++
5 files changed, 208 insertions(+), 56 deletions(-)
create mode 100644 mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
create mode 100755 mlir/test/Integration/data/ds.mtx
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index c65a27567d59d9a..1e9aa2bdf45dbdb 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -277,7 +277,7 @@ constexpr bool isCompressedDLT(DimLevelType dlt) {
static_cast<uint8_t>(DimLevelType::Compressed);
}
-/// Check if the `DimLevelType` is compressed (regardless of properties).
+/// Check if the `DimLevelType` is loose compressed (regardless of properties).
constexpr bool isLooseCompressedDLT(DimLevelType dlt) {
return (static_cast<uint8_t>(dlt) & ~3) ==
static_cast<uint8_t>(DimLevelType::LooseCompressed);
@@ -289,6 +289,12 @@ constexpr bool isSingletonDLT(DimLevelType dlt) {
static_cast<uint8_t>(DimLevelType::Singleton);
}
+/// Check if the `DimLevelType` is 2OutOf4 (regardless of properties).
+constexpr bool is2OutOf4DLT(DimLevelType dlt) {
+ return (static_cast<uint8_t>(dlt) & ~3) ==
+ static_cast<uint8_t>(DimLevelType::TwoOutOfFour);
+}
+
/// Check if the `DimLevelType` is ordered (regardless of storage format).
constexpr bool isOrderedDLT(DimLevelType dlt) {
return !(static_cast<uint8_t>(dlt) & 2);
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index ad92ee1f89fc153..460549726356370 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -115,11 +115,19 @@ class SparseTensorStorageBase {
return isCompressedDLT(getLvlType(l));
}
+ /// Safely checks if the level uses loose compressed storage.
+ bool isLooseCompressedLvl(uint64_t l) const {
+ return isLooseCompressedDLT(getLvlType(l));
+ }
+
/// Safely checks if the level uses singleton storage.
bool isSingletonLvl(uint64_t l) const {
return isSingletonDLT(getLvlType(l));
}
+ /// Safely checks if the level uses 2 out of 4 storage.
+ bool is2OutOf4Lvl(uint64_t l) const { return is2OutOf4DLT(getLvlType(l)); }
+
/// Safely checks if the level is ordered.
bool isOrderedLvl(uint64_t l) const { return isOrderedDLT(getLvlType(l)); }
@@ -138,9 +146,6 @@ class SparseTensorStorageBase {
MLIR_SPARSETENSOR_FOREVERY_FIXED_O(DECL_GETCOORDINATES)
#undef DECL_GETCOORDINATES
- /// Gets the coordinate-value stored at the given level and position.
- virtual uint64_t getCrd(uint64_t lvl, uint64_t pos) const = 0;
-
/// Gets primary storage.
#define DECL_GETVALUES(VNAME, V) virtual void getValues(std::vector<V> **);
MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETVALUES)
@@ -280,13 +285,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
*out = &values;
}
- /// Returns coordinate at given position.
- uint64_t getCrd(uint64_t lvl, uint64_t pos) const final {
- assert(isCompressedDLT(getLvlType(lvl)) || isSingletonDLT(getLvlType(lvl)));
- assert(pos < coordinates[lvl].size());
- return coordinates[lvl][pos]; // Converts the stored `C` into `uint64_t`.
- }
-
/// Partially specialize forwarding insertions based on template types.
void forwardingInsert(const uint64_t *dimCoords, V val) final {
assert(dimCoords && coo);
@@ -302,7 +300,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
if (allDense) {
uint64_t lvlRank = getLvlRank();
uint64_t valIdx = 0;
- // Linearize the address
+ // Linearize the address.
for (uint64_t lvl = 0; lvl < lvlRank; lvl++)
valIdx = valIdx * getLvlSize(lvl) + lvlCoords[lvl];
values[valIdx] = val;
@@ -441,16 +439,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
}
private:
- /// Appends an arbitrary new position to `positions[lvl]`. This method
- /// checks that `pos` is representable in the `P` type; however, it
- /// does not check that `pos` is semantically valid (i.e., larger than
- /// the previous position and smaller than `coordinates[lvl].capacity()`).
- void appendPos(uint64_t lvl, uint64_t pos, uint64_t count = 1) {
- assert(isCompressedLvl(lvl));
- positions[lvl].insert(positions[lvl].end(), count,
- detail::checkOverflowCast<P>(pos));
- }
-
/// Appends coordinate `crd` to level `lvl`, in the semantically
/// general sense. For non-dense levels, that means appending to the
/// `coordinates[lvl]` array, checking that `crd` is representable in
@@ -461,11 +449,11 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
/// `full` is the number of "entries" already written to `values` for this
/// segment (aka one after the highest coordinate previously appended).
void appendCrd(uint64_t lvl, uint64_t full, uint64_t crd) {
- const auto dlt = getLvlType(lvl); // Avoid redundant bounds checking.
- if (isCompressedDLT(dlt) || isSingletonDLT(dlt)) {
+ if (!isDenseLvl(lvl)) {
+ assert(isCompressedLvl(lvl) || isLooseCompressedLvl(lvl) ||
+ isSingletonLvl(lvl) || is2OutOf4Lvl(lvl));
coordinates[lvl].push_back(detail::checkOverflowCast<C>(crd));
} else { // Dense level.
- assert(isDenseDLT(dlt));
assert(crd >= full && "Coordinate was already filled");
if (crd == full)
return; // Short-circuit, since it'll be a nop.
@@ -482,15 +470,13 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
/// storage, as opposed to "level-sizes" which are the cardinality
/// of possible coordinates for that level.
uint64_t assembledSize(uint64_t parentSz, uint64_t l) const {
- const auto dlt = getLvlType(l); // Avoid redundant bounds checking.
- if (isCompressedDLT(dlt))
+ if (isCompressedLvl(l))
return positions[l][parentSz];
- if (isSingletonDLT(dlt))
+ if (isSingletonLvl(l))
return parentSz; // New size is same as the parent.
- if (isDenseDLT(dlt))
- return parentSz * getLvlSize(l);
- MLIR_SPARSETENSOR_FATAL("unsupported level type: %d\n",
- static_cast<uint8_t>(dlt));
+ // TODO: support levels assignment for loose/2:4?
+ assert(isDenseLvl(l));
+ return parentSz * getLvlSize(l);
}
/// Initializes sparse tensor storage scheme from a memory-resident sparse
@@ -514,7 +500,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
uint64_t seg = lo + 1;
if (isUniqueLvl(l))
while (seg < hi && lvlElements[seg].coords[l] == c)
- ++seg;
+ seg++;
// Handle segment in interval for sparse or dense level.
appendCrd(l, full, c);
full = c + 1;
@@ -529,14 +515,22 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
/// Finalizes the sparse position structure at this level.
void finalizeSegment(uint64_t l, uint64_t full = 0, uint64_t count = 1) {
if (count == 0)
- return; // Short-circuit, since it'll be a nop.
- const auto dlt = getLvlType(l); // Avoid redundant bounds checking.
- if (isCompressedDLT(dlt)) {
- appendPos(l, coordinates[l].size(), count);
- } else if (isSingletonDLT(dlt)) {
+ return; // Short-circuit, since it'll be a nop.
+ if (isCompressedLvl(l)) {
+ uint64_t pos = coordinates[l].size();
+ positions[l].insert(positions[l].end(), count,
+ detail::checkOverflowCast<P>(pos));
+ } else if (isLooseCompressedLvl(l)) {
+ // Finish this level, and push pairs for the empty ones, and one
+ // more for next level. Note that this always leaves one extra
+ // unused element at the end.
+ uint64_t pos = coordinates[l].size();
+ positions[l].insert(positions[l].end(), 2 * count,
+ detail::checkOverflowCast<P>(pos));
+ } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) {
return; // Nothing to finalize.
} else { // Dense dimension.
- assert(isDenseDLT(dlt));
+ assert(isDenseLvl(l));
const uint64_t sz = getLvlSizes()[l];
assert(sz >= full && "Segment is overfull");
count = detail::checkedMul(count, sz - full);
@@ -589,7 +583,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
(crd < cur && !isOrderedLvl(l))) {
return l;
}
-
if (crd < cur) {
assert(false && "non-lexicographic insertion");
return -1u;
@@ -609,27 +602,37 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
return;
}
if (isCompressedLvl(l)) {
- // Look up the bounds of the `l`-level segment determined by the
- // `(l - 1)`-level position `parentPos`.
const std::vector<P> &positionsL = positions[l];
assert(parentPos + 1 < positionsL.size());
const uint64_t pstart = static_cast<uint64_t>(positionsL[parentPos]);
const uint64_t pstop = static_cast<uint64_t>(positionsL[parentPos + 1]);
- // Loop-invariant code for looking up the `l`-level coordinates.
const std::vector<C> &coordinatesL = coordinates[l];
assert(pstop <= coordinatesL.size());
- for (uint64_t pos = pstart; pos < pstop; ++pos) {
+ for (uint64_t pos = pstart; pos < pstop; pos++) {
lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
toCOO(pos, l + 1, dimCoords);
}
- } else if (isSingletonLvl(l)) {
- lvlCursor[l] = getCrd(l, parentPos);
+ } else if (isLooseCompressedLvl(l)) {
+ const std::vector<P> &positionsL = positions[l];
+ assert(2 * parentPos + 1 < positionsL.size());
+ const uint64_t pstart = static_cast<uint64_t>(positionsL[2 * parentPos]);
+ const uint64_t pstop =
+ static_cast<uint64_t>(positionsL[2 * parentPos + 1]);
+ const std::vector<C> &coordinatesL = coordinates[l];
+ assert(pstop <= coordinatesL.size());
+ for (uint64_t pos = pstart; pos < pstop; pos++) {
+ lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
+ toCOO(pos, l + 1, dimCoords);
+ }
+ } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) {
+ assert(parentPos < coordinates[l].size());
+ lvlCursor[l] = static_cast<uint64_t>(coordinates[l][parentPos]);
toCOO(parentPos, l + 1, dimCoords);
} else { // Dense level.
assert(isDenseLvl(l));
const uint64_t sz = getLvlSizes()[l];
const uint64_t pstart = parentPos * sz;
- for (uint64_t c = 0; c < sz; ++c) {
+ for (uint64_t c = 0; c < sz; c++) {
lvlCursor[l] = c;
toCOO(pstart + c, l + 1, dimCoords);
}
@@ -706,19 +709,30 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
bool allDense = true;
uint64_t sz = 1;
for (uint64_t l = 0; l < lvlRank; l++) {
- const DimLevelType dlt = lvlTypes[l]; // Avoid redundant bounds checking.
- if (isCompressedDLT(dlt)) {
+ if (isCompressedLvl(l)) {
positions[l].reserve(sz + 1);
positions[l].push_back(0);
coordinates[l].reserve(sz);
sz = 1;
allDense = false;
- } else if (isSingletonDLT(dlt)) {
+ } else if (isLooseCompressedLvl(l)) {
+ positions[l].reserve(2 * sz + 1); // last one unused
+ positions[l].push_back(0);
coordinates[l].reserve(sz);
sz = 1;
allDense = false;
+ } else if (isSingletonLvl(l)) {
+ coordinates[l].reserve(sz);
+ sz = 1;
+ allDense = false;
+ } else if (is2OutOf4Lvl(l)) {
+ assert(allDense && l == lvlRank - 1 && "unexpected 2:4 usage");
+ sz = detail::checkedMul(sz, lvlSizes[l]) / 2;
+ coordinates[l].reserve(sz);
+ values.reserve(sz);
+ allDense = false;
} else { // Dense level.
- assert(isDenseDLT(dlt));
+ assert(isDenseLvl(l));
sz = detail::checkedMul(sz, lvlSizes[l]);
}
}
@@ -773,6 +787,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
positions[l].assign(posPtr, posPtr + parentSz + 1);
coordinates[l].assign(crdPtr, crdPtr + positions[l][parentSz]);
} else {
+ // TODO: support levels assignment for loose/2:4?
assert(isDenseLvl(l));
}
parentSz = assembledSize(parentSz, l);
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
index 40805a179d4b385..ea7e3125b7f47d9 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
@@ -36,11 +36,8 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT
assert(lvlRank > 0 && "Trivial shape is unsupported");
for (uint64_t l = 0; l < lvlRank; ++l) {
assert(lvlSizes[l] > 0 && "Level size zero has trivial storage");
- const auto dlt = lvlTypes[l];
- if (!(isDenseDLT(dlt) || isCompressedDLT(dlt) || isSingletonDLT(dlt))) {
- MLIR_SPARSETENSOR_FATAL("unsupported level type: %d\n",
- static_cast<uint8_t>(dlt));
- }
+ assert(isDenseLvl(l) || isCompressedLvl(l) || isLooseCompressedLvl(l) ||
+ isSingletonLvl(l) || is2OutOf4Lvl(l));
}
}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
new file mode 100644
index 000000000000000..773c34e1f3dabca
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
@@ -0,0 +1,120 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
+// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// REDEFINE: %{env} = TENSOR0="%mlir_src_dir/test/Integration/data/ds.mtx"
+// RUN: %{compile} | env %{env} %{run} | FileCheck %s
+//
+// TODO: enable!
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false
+// R_UN: %{compile} | env %{env} %{run} | FileCheck %s
+
+!Filename = !llvm.ptr<i8>
+
+#CSR = #sparse_tensor.encoding<{
+ map = (i, j) -> ( i : dense, j : compressed)
+}>
+
+#CSR_hi = #sparse_tensor.encoding<{
+ map = (i, j) -> ( i : dense, j : loose_compressed)
+}>
+
+#NV_24 = #sparse_tensor.encoding<{
+ map = ( i, j ) -> ( i : dense,
+ j floordiv 4 : dense,
+ j mod 4 : block2_4),
+ crdWidth = 8
+}>
+
+module {
+
+ func.func private @getTensorFilename(index) -> (!Filename)
+
+ //
+ // Input matrix:
+ //
+ // [[0.0, 0.0, 1.0, 2.0, 0.0, 3.0, 0.0, 4.0],
+ // [0.0, 5.0, 6.0, 0.0, 7.0, 0.0, 0.0, 8.0],
+ // [9.0, 0.0, 10.0, 0.0, 11.0, 12.0, 0.0, 0.0]]
+ //
+ func.func @entry() {
+ %u0 = arith.constant 0 : i8
+ %c0 = arith.constant 0 : index
+ %f0 = arith.constant 0.0 : f64
+
+ %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+ %A1 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #CSR>
+ %A2 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #CSR_hi>
+ %A3 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #NV_24>
+
+ //
+ // CSR:
+ //
+ // CHECK: ( 0, 4, 8, 12 )
+ // CHECK-NEXT: ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+ // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
+ //
+ %pos1 = sparse_tensor.positions %A1 {level = 1 : index } : tensor<?x?xf64, #CSR> to memref<?xindex>
+ %vecp1 = vector.transfer_read %pos1[%c0], %c0 : memref<?xindex>, vector<4xindex>
+ vector.print %vecp1 : vector<4xindex>
+ %crd1 = sparse_tensor.coordinates %A1 {level = 1 : index } : tensor<?x?xf64, #CSR> to memref<?xindex>
+ %vecc1 = vector.transfer_read %crd1[%c0], %c0 : memref<?xindex>, vector<12xindex>
+ vector.print %vecc1 : vector<12xindex>
+ %val1 = sparse_tensor.values %A1 : tensor<?x?xf64, #CSR> to memref<?xf64>
+ %vecv1 = vector.transfer_read %val1[%c0], %f0 : memref<?xf64>, vector<12xf64>
+ vector.print %vecv1 : vector<12xf64>
+
+ //
+ // CSR_hi:
+ //
+ // CHECK-NEXT: ( 0, 4, 4, 8, 8, 12 )
+ // CHECK-NEXT: ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+ // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
+ //
+ %pos2 = sparse_tensor.positions %A2 {level = 1 : index } : tensor<?x?xf64, #CSR_hi> to memref<?xindex>
+ %vecp2 = vector.transfer_read %pos2[%c0], %c0 : memref<?xindex>, vector<6xindex>
+ vector.print %vecp2 : vector<6xindex>
+ %crd2 = sparse_tensor.coordinates %A2 {level = 1 : index } : tensor<?x?xf64, #CSR_hi> to memref<?xindex>
+ %vecc2 = vector.transfer_read %crd2[%c0], %c0 : memref<?xindex>, vector<12xindex>
+ vector.print %vecc2 : vector<12xindex>
+ %val2 = sparse_tensor.values %A2 : tensor<?x?xf64, #CSR_hi> to memref<?xf64>
+ %vecv2 = vector.transfer_read %val2[%c0], %f0 : memref<?xf64>, vector<12xf64>
+ vector.print %vecv2 : vector<12xf64>
+
+ //
+ // NV_24
+ //
+ // CHECK-NEXT: ( 2, 3, 1, 3, 1, 2, 0, 3, 0, 2, 0, 1 )
+ // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
+ //
+ %crd3 = sparse_tensor.coordinates %A3 {level = 2 : index } : tensor<?x?xf64, #NV_24> to memref<?xi8>
+ %vecc3 = vector.transfer_read %crd3[%c0], %u0 : memref<?xi8>, vector<12xi8>
+ vector.print %vecc3 : vector<12xi8>
+ %val3 = sparse_tensor.values %A3 : tensor<?x?xf64, #NV_24> to memref<?xf64>
+ %vecv3 = vector.transfer_read %val3[%c0], %f0 : memref<?xf64>, vector<12xf64>
+ vector.print %vecv3 : vector<12xf64>
+
+ // Release the resources.
+ bufferization.dealloc_tensor %A1: tensor<?x?xf64, #CSR>
+ bufferization.dealloc_tensor %A2: tensor<?x?xf64, #CSR_hi>
+ bufferization.dealloc_tensor %A3: tensor<?x?xf64, #NV_24>
+
+ return
+ }
+}
diff --git a/mlir/test/Integration/data/ds.mtx b/mlir/test/Integration/data/ds.mtx
new file mode 100755
index 000000000000000..8acc2ce081b6b35
--- /dev/null
+++ b/mlir/test/Integration/data/ds.mtx
@@ -0,0 +1,14 @@
+%%MatrixMarket matrix coordinate real general
+3 8 12
+1 3 1.0
+1 4 2.0
+1 6 3.0
+1 8 4.0
+2 2 5.0
+2 3 6.0
+2 5 7.0
+2 8 8.0
+3 1 9.0
+3 3 10.0
+3 5 11.0
+3 6 12.0