[Mlir-commits] [mlir] 93c81f4 - [mlir][taco] Uses sparse_tensor.new to read tensor input data from files.

Thu Feb 3 08:26:38 PST 2022

Author: Bixia Zheng
Date: 2022-02-03T08:26:33-08:00
New Revision: 93c81f44cce802be7f2b723a96ed8e10db6101fb

URL: https://github.com/llvm/llvm-project/commit/93c81f44cce802be7f2b723a96ed8e10db6101fb
DIFF: https://github.com/llvm/llvm-project/commit/93c81f44cce802be7f2b723a96ed8e10db6101fb.diff

LOG: [mlir][taco] Uses sparse_tensor.new to read tensor input data from files.

Replace the Python implementation for reading tensor input data from files with
create_sparse_tensor that uses sparse_tensor.new.

The MLIR TNS format has two extra metadata lines. Add the extra metadata to a
test data file.

Implement TACO tensor methods evaluate and unpack.

Add unit tests.

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D118803

Added: 
    mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py

Modified: 
    mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
    mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
    mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py

Removed: 
    


################################################################################
diff  --git a/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns b/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
index a6c570c3c7d8f..b82ce864820fb 100644

--- a/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
@@ -1,3 +1,8 @@
+# Extended FROSTT format:
+# rank number-non-zero-elements
+# dimension-sizes
+3 5
+2 4 4
 1 1 1 1.0
 1 2 2 2.0
 1 3 4 3.0

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
index f74ae09b9087e..24f114dba64a9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
@@ -532,6 +532,24 @@ def get_formats(self, idx: int) -> Tuple[ModeFormat]:
     return tuple(self._get_element(idx).dst_format.format_pack.formats)
 
 
+class _SparseValueInfo(enum.Enum):
+  """Describes how a sparse tensor value is stored.
+  _UNPACKED: The sparse tensor value is stored as (coordinates, values) in
+    Python.
+  _PACKED: The sparse tensor value is stored as a C pointer to a packed MLIR
+    sparse tensor.
+  """
+  _UNPACKED = 0
+  _PACKED = 1
+
+
+@dataclasses.dataclass(frozen=True)
+class _Assignment:
+  """Records an assignment to a tensor T as T[indices] = expression."""
+  indices: Tuple["IndexVar", ...]
+  expression: "IndexExpr"
+
+
 class Tensor:
   """The tensor class.
 
@@ -622,12 +640,14 @@ def __init__(self,
     self._name = name or self._get_unique_name()
 
     self._dtype = dtype
+    self._assignment = None
     # We currently use _coords and _values to host the sparse tensor value with
     # COO format, and _dense_storage to host the dense tensor value. We haven't
     # implement the conversion between the two storages yet. This will be
     # improved in a follow up CL.
     self._coords = []
     self._values = []
+    self._sparse_value_location = _SparseValueInfo._UNPACKED
     self._dense_storage = None
     self._stats = _Stats()
     if value_or_shape is None or isinstance(value_or_shape, int) or isinstance(
@@ -647,7 +667,29 @@ def __init__(self,
                        "Must be a tuple or list for a shape or a single value"
                        f"if initializing a scalar tensor: {value_or_shape}.")
 
+  def is_unpacked(self) -> bool:
+    """Returns true if the tensor value is not packed as MLIR sparse tensor."""
+    return (self._sparse_value_location == _SparseValueInfo._UNPACKED)
+
+  def unpack(self) -> None:
+    """Unpacks the MLIR sparse tensor representation."""
+    if self.is_dense() or self.is_unpacked():
+      return
+
+    # Use the output MLIR sparse tensor pointer to retrieve the COO-flavored
+    # values and verify the values.
+    rank, nse, shape, values, indices = utils.sparse_tensor_to_coo_tensor(
+        self._packed_sparse_value, np.float64)
+    assert rank == self.order
+    assert np.allclose(self.shape, shape)
+    assert nse == len(values)
+    self._coords = indices
+    self._values = values
+    self._sparse_value_location = _SparseValueInfo._UNPACKED
+
   def __repr__(self) -> str:
+    self._sync_value()
+    self._unpack()
     value_str = (f"{repr(self._dense_storage)})" if self.is_dense() else
                  f"{repr(self._coords)} {repr(self._values)})")
     return (f"Tensor(_name={repr(self._name)} "
@@ -665,6 +707,11 @@ def insert(self, coords: List[int], val: Union[float, int]) -> None:
     Raises:
       ValueError: When there is any problem in the parameters.
     """
+    if self.is_dense():
+      raise ValueError("Insert method is not supported for dense tensors.")
+    if self._assignment != None or not self.is_unpacked():
+      raise ValueError(
+          "Can't use Insert method for a tensor constructed from a file.")
     if not isinstance(coords, list):
       raise ValueError(f"Non list coordinate detected: {coords}.")
     if not _all_instance_of(coords, int):
@@ -692,6 +739,9 @@ def to_array(self) -> np.ndarray:
     if not self.is_dense():
       raise ValueError("Conversion from non-dense Tensor "
                        "to numpy array not supported yet.")
+
+    self._sync_value()
+
     return self._dense_storage
 
   @staticmethod
@@ -755,6 +805,32 @@ def from_coo(
 
     return tensor
 
+  @staticmethod
+  def from_file(
+      filename: str,
+      fmt: Format,
+      dtype: DType,
+  ) -> "Tensor":
+    """Constructs a sparse tensor using the COO-flavored values from a file.
+
+    Args:
+      filename: A string for the name of the file that contains the sparse
+        tensor data.
+      fmt: The tensor storage format.
+      dtype: The tensor element data type.
+
+    Returns:
+      A tensor with the given non-zero values and storage format. The tensor
+      value is stored as an MLIR sparse tensor.
+    """
+    sparse_tensor, shape = utils.create_sparse_tensor(filename,
+                                                      fmt.format_pack.formats)
+    tensor = Tensor(shape.tolist(), fmt)
+    tensor._sparse_value_location = _SparseValueInfo._PACKED
+    tensor._packed_sparse_value = sparse_tensor
+
+    return tensor
+
   @property
   def dtype(self) -> DType:
     """Returns the data type for the Tensor."""
@@ -827,7 +903,13 @@ def __setitem__(self, key, value) -> None:
       raise ValueError("Mismatch between indices and tensor rank: "
                        f"len({indices}) != {self.order}.")
 
-    result = value.evaluate(self, indices)
+    self._assignment = _Assignment(indices, value)
+
+  def evaluate(self) -> None:
+    """Evaluates the assignment to the tensor."""
+    result = self._assignment.expression.evaluate(self,
+                                                  self._assignment.indices)
+    self._assignment = None
     if self.is_dense():
       assert isinstance(result, np.ndarray)
       self._dense_storage = result
@@ -836,6 +918,11 @@ def __setitem__(self, key, value) -> None:
       assert (result[0].ndim, result[1].ndim) == (1, 2)
       (self._values, self._coords) = result
 
+  def _sync_value(self) -> None:
+    """Updates the tensor value by evaluating the pending assignment."""
+    if self._assignment is not None:
+      self.evaluate()
+
   def mlir_tensor_type(self) -> ir.RankedTensorType:
     """Returns the MLIR type for the tensor."""
     return _mlir_tensor_type(self._dtype, tuple(self._shape),
@@ -860,16 +947,21 @@ def ctype_pointer(self) -> ctypes.pointer:
         self._dense_storage = np.zeros(self._shape, self._dtype.value)
       return _ctype_pointer_from_array(self._dense_storage)
 
-    shape = np.array(self._shape, np.int64)
-    indices = np.array(self._coords, np.int64)
-    values = np.array(self._values, self._dtype.value)
-    ptr = utils.coo_tensor_to_sparse_tensor(shape, values, indices)
+    if self.is_unpacked():
+      shape = np.array(self._shape, np.int64)
+      indices = np.array(self._coords, np.int64)
+      values = np.array(self._values, self._dtype.value)
+      ptr = utils.coo_tensor_to_sparse_tensor(shape, values, indices)
+    else:
+      ptr = self._packed_sparse_value
+
     return ctypes.pointer(ctypes.cast(ptr, ctypes.c_void_p))
 
   def get_coordinates_and_values(
       self) -> Tuple[List[Tuple[int, ...]], List[_AnyRuntimeType]]:
     """Returns the coordinates and values for the non-zero elements."""
     if not self.is_dense():
+      assert (self.is_unpacked())
       return (self._coords, self._values)
 
     # Coordinates for non-zero elements, grouped by dimensions.

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
index 0ee69c78da37a..5d446d6af1636 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
@@ -30,118 +30,6 @@
 _MTX_FILENAME_SUFFIX = ".mtx"
 _TNS_FILENAME_SUFFIX = ".tns"
 
-_MTX_HEAD = "%%MatrixMarket"
-_MTX_MATRIX = "matrix"
-_MTX_COORDINATE = "coordinate"
-_MTX_REAL = "real"
-_MTX_SYMMETRY = "symmetric"
-_MTX_GENERAL = "general"
-_SYMMETRY_FIELD_ID = 4
-
-# The TACO supported header for .mtx has the following five fields:
-# . %%MatrixMarket
-# . matrix | tensor
-# . coordinate | array
-# . real
-# . symmetric | general
-#
-# This is what we support currently.
-_SUPPORTED_HEADER_FIELDS = ((_MTX_HEAD,), (_MTX_MATRIX,), (_MTX_COORDINATE,),
-                            (_MTX_REAL,), (_MTX_GENERAL, _MTX_SYMMETRY))
-
-_A_SPACE = " "
-_MTX_COMMENT = "%"
-_TNS_COMMENT = "#"
-
-
-def _coordinate_from_strings(strings: List[str]) -> List[int]:
-  """"Return the coordinate represented by the input strings."""
-  # Coordinates are 1-based in the text file and 0-based in memory.
-  return [int(s) - 1 for s in strings]
-
-
-def _read_coordinate_format(file: TextIO, tensor: Tensor,
-                            is_symmetric: bool) -> None:
-  """Reads tensor values in coordinate format."""
-  rank = tensor.order
-  # Process the data for the tensor.
-  for line in file:
-    if not line:
-      continue
-
-    fields = line.split(_A_SPACE)
-    if rank != len(fields) - 1:
-      raise ValueError("The format and data have mismatched ranks: "
-                       f"{rank} vs {len(fields)-1}.")
-    coordinate = _coordinate_from_strings(fields[:-1])
-    value = float(fields[-1])
-    tensor.insert(coordinate, value)
-    if is_symmetric and coordinate[0] != coordinate[-1]:
-      coordinate.reverse()
-      tensor.insert(coordinate, value)
-
-
-def _read_mtx(file: TextIO, fmt: Format) -> Tensor:
-  """Inputs tensor from a text file with .mtx format."""
-  # The first line should have this five fields:
-  #   head tensor-kind format data-type symmetry
-  fields = file.readline().rstrip("\n").split(_A_SPACE)
-  tuple_to_str = lambda x: "|".join(x)
-  if len(fields) != len(_SUPPORTED_HEADER_FIELDS):
-    raise ValueError(
-        "Expected first line with theses fields "
-        f"{' '.join(map(tuple_to_str, _SUPPORTED_HEADER_FIELDS))}: "
-        f"{' '.join(fields)}")
-
-  for i, values in enumerate(_SUPPORTED_HEADER_FIELDS):
-    if fields[i] not in values:
-      raise ValueError(f"The {i}th field can only be one of these values "
-                       f"{tuple_to_str(values)}: {fields[i]}")
-
-  is_symmetric = (fields[_SYMMETRY_FIELD_ID] == _MTX_SYMMETRY)
-  # Skip leading empty lines or comment lines.
-  line = file.readline()
-  while not line or line[0] == _MTX_COMMENT:
-    line = file.readline()
-
-  # Process the first data line with dimensions and number of non-zero values.
-  fields = line.split(_A_SPACE)
-  rank = fmt.rank()
-  if rank != len(fields) - 1:
-    raise ValueError("The format and data have mismatched ranks: "
-                     f"{rank} vs {len(fields)-1}.")
-  shape = fields[:-1]
-  shape = [int(s) for s in shape]
-  num_non_zero = float(fields[-1])
-
-  # Read the tensor values in coordinate format.
-  tensor = Tensor(shape, fmt)
-  _read_coordinate_format(file, tensor, is_symmetric)
-  return tensor
-
-
-def _read_tns(file: TextIO, fmt: Format) -> Tensor:
-  """Inputs tensor from a text file with .tns format."""
-  rank = fmt.rank()
-  coordinates = []
-  values = []
-  dtype = DType(Type.FLOAT64)
-
-  for line in file:
-    # Skip empty lines and comment lines.
-    if not line or line[0] == _TNS_COMMENT:
-      continue
-
-    # Process each line with a coordinate and the value at the coordinate.
-    fields = line.split(_A_SPACE)
-    if rank != len(fields) - 1:
-      raise ValueError("The format and data have mismatched ranks: "
-                       f"{rank} vs {len(fields)-1}.")
-    coordinates.append(tuple(_coordinate_from_strings(fields[:-1])))
-    values.append(dtype.value(fields[-1]))
-
-  return Tensor.from_coo(coordinates, values, fmt, dtype)
-
 
 def _write_tns(file: TextIO, tensor: Tensor) -> None:
   """Outputs a tensor to a file using .tns format."""
@@ -177,9 +65,7 @@ def read(filename: str, fmt: Format) -> Tensor:
   if not isinstance(fmt, Format) or fmt.is_dense():
     raise ValueError(f"Expected a sparse Format object: {fmt}.")
 
-  with open(filename, "r") as file:
-    return (_read_mtx(file, fmt) if filename.endswith(_MTX_FILENAME_SUFFIX) else
-            _read_tns(file, fmt))
+  return Tensor.from_file(filename, fmt, DType(Type.FLOAT64))
 
 
 def write(filename: str, tensor: Tensor) -> None:
@@ -202,5 +88,7 @@ def write(filename: str, tensor: Tensor) -> None:
   if not isinstance(tensor, Tensor):
     raise ValueError(f"Expected a Tensor object: {tensor}.")
 
+  # TODO: combine the evaluation and the outputting into one step.
+  tensor._sync_value()
   with open(filename, "w") as file:
     return _write_tns(file, tensor)

diff  --git a/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py b/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py
new file mode 100644
index 0000000000000..1466dc841dcd6
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py
@@ -0,0 +1,110 @@
+# RUN: SUPPORTLIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s
+
+from string import Template
+
+import numpy as np
+import os
+import sys
+import tempfile
+
+_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_PATH)
+from tools import mlir_pytaco
+from tools import mlir_pytaco_io
+from tools import mlir_pytaco_utils as pytaco_utils
+
+# Define the aliases to shorten the code.
+_COMPRESSED = mlir_pytaco.ModeFormat.COMPRESSED
+_DENSE = mlir_pytaco.ModeFormat.DENSE
+
+
+def _run(f):
+  print(f.__name__)
+  f()
+  return f
+
+
+_FORMAT = mlir_pytaco.Format([_COMPRESSED, _COMPRESSED])
+_MTX_DATA_TEMPLATE = Template(
+    """%%MatrixMarket matrix coordinate real $general_or_symmetry
+3 3 3
+3 1 3
+1 2 2
+3 2 4
+""")
+
+
+def _get_mtx_data(value):
+  mtx_data = _MTX_DATA_TEMPLATE
+  return mtx_data.substitute(general_or_symmetry=value)
+
+
+# CHECK-LABEL: test_read_mtx_matrix_general
+@_run
+def test_read_mtx_matrix_general():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.mtx")
+    with open(file_name, "w") as file:
+      file.write(_get_mtx_data("general"))
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  passed += np.allclose(coords, [[0, 1], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)
+
+
+# CHECK-LABEL: test_read_mtx_matrix_symmetry
+@_run
+def test_read_mtx_matrix_symmetry():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.mtx")
+    with open(file_name, "w") as file:
+      file.write(_get_mtx_data("symmetric"))
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  print(coords)
+  print(values)
+  passed += np.allclose(coords,
+                        [[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 2.0, 4.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)
+
+
+_TNS_DATA = """2 3
+3 2
+3 1 3
+1 2 2
+3 2 4
+"""
+
+
+# CHECK-LABEL: test_read_tns
+@_run
+def test_read_tns():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.tns")
+    with open(file_name, "w") as file:
+      file.write(_TNS_DATA)
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  passed += np.allclose(coords, [[0, 1], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)