[compiler-rt] 98bb198 - [sanitizer] Add Lempel–Ziv–Welch encoder/decoder

Fri Dec 3 13:11:49 PST 2021

Author: Vitaly Buka
Date: 2021-12-03T13:11:40-08:00
New Revision: 98bb198693ca562742b4d4698c9680399a4cc1f5

URL: https://github.com/llvm/llvm-project/commit/98bb198693ca562742b4d4698c9680399a4cc1f5
DIFF: https://github.com/llvm/llvm-project/commit/98bb198693ca562742b4d4698c9680399a4cc1f5.diff

LOG: [sanitizer] Add Lempel–Ziv–Welch encoder/decoder

It's very simple, fast and efficient for the stack depot compression if used on entire pointers.

Reviewed By: morehouse, kstoimenov

Differential Revision: https://reviews.llvm.org/D114918

Added: 
    compiler-rt/lib/sanitizer_common/sanitizer_lzw.h
    compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp

Modified: 
    compiler-rt/lib/sanitizer_common/CMakeLists.txt
    compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
index 48c801017fe1..1e8339fb645e 100644

--- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
@@ -155,6 +155,7 @@ set(SANITIZER_IMPL_HEADERS
   sanitizer_linux.h
   sanitizer_list.h
   sanitizer_local_address_space_view.h
+  sanitizer_lzw.h
   sanitizer_mac.h
   sanitizer_malloc_mac.inc
   sanitizer_mutex.h

diff  --git a/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h b/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h
new file mode 100644
index 000000000000..42acfbdcea09
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h
@@ -0,0 +1,159 @@
+//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lempel–Ziv–Welch encoding/decoding
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_LZW_H
+#define SANITIZER_LZW_H
+
+#include "sanitizer_dense_map.h"
+
+namespace __sanitizer {
+
+using LzwCodeType = u32;
+
+template <class T, class ItIn, class ItOut>
+ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
+  using Substring =
+      detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
+
+  // Sentinel value for substrings of len 1.
+  static constexpr LzwCodeType kNoPrefix =
+      Min(DenseMapInfo<Substring>::getEmptyKey().first,
+          DenseMapInfo<Substring>::getTombstoneKey().first) -
+      1;
+  DenseMap<Substring, LzwCodeType> prefix_to_code;
+  {
+    // Add all substring of len 1 as initial dictionary.
+    InternalMmapVector<T> dict_len1;
+    for (auto it = begin; it != end; ++it)
+      if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
+        dict_len1.push_back(*it);
+
+    // Slightly helps with later delta encoding.
+    Sort(dict_len1.data(), dict_len1.size());
+
+    // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
+    // just generate them.
+    *out = dict_len1.size();
+    ++out;
+
+    for (uptr i = 0; i != dict_len1.size(); ++i) {
+      // Remap after the Sort.
+      prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
+      *out = dict_len1[i];
+      ++out;
+    }
+    CHECK_EQ(prefix_to_code.size(), dict_len1.size());
+  }
+
+  if (begin == end)
+    return out;
+
+  // Main LZW encoding loop.
+  LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
+  ++begin;
+  for (auto it = begin; it != end; ++it) {
+    // Extend match with the new item.
+    auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
+    if (ins.second) {
+      // This is a new substring, but emit the code for the current match
+      // (before extend). This allows LZW decoder to recover the dictionary.
+      *out = match;
+      ++out;
+      // Reset the match to a single item, which must be already in the map.
+      match = prefix_to_code.find({kNoPrefix, *it})->second;
+    } else {
+      // Already known, use as the current match.
+      match = ins.first->second;
+    }
+  }
+
+  *out = match;
+  ++out;
+
+  return out;
+}
+
+template <class T, class ItIn, class ItOut>
+ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
+  if (begin == end)
+    return out;
+
+  // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
+  InternalMmapVector<T> dict_len1(*begin);
+  ++begin;
+
+  if (begin == end)
+    return out;
+
+  for (auto& v : dict_len1) {
+    v = *begin;
+    ++begin;
+  }
+
+  // Substrings of len 2 and up. Indexes are shifted because [0,
+  // dict_len1.size()) stored in dict_len1. Substings get here after being
+  // emitted to the output, so we can use output position.
+  InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
+      code_to_substr;
+
+  // Copies already emitted substrings into the output again.
+  auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
+    if (code < dict_len1.size()) {
+      *out = dict_len1[code];
+      ++out;
+      return out;
+    }
+    const auto& s = code_to_substr[code - dict_len1.size()];
+
+    for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
+    return out;
+  };
+
+  // Returns lens of the substring with the given code.
+  auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
+    if (code < dict_len1.size())
+      return 1;
+    const auto& s = code_to_substr[code - dict_len1.size()];
+    return s.second - s.first;
+  };
+
+  // Main LZW decoding loop.
+  LzwCodeType prev_code = *begin;
+  ++begin;
+  out = copy(prev_code, out);
+  for (auto it = begin; it != end; ++it) {
+    LzwCodeType code = *it;
+    auto start = out;
+    if (code == dict_len1.size() + code_to_substr.size()) {
+      // Special LZW case. The code is not in the dictionary yet. This is
+      // possible only when the new substring is the same as previous one plus
+      // the first item of the previous substring. We can emit that in two
+      // steps.
+      out = copy(prev_code, out);
+      *out = *start;
+      ++out;
+    } else {
+      out = copy(code, out);
+    }
+
+    // Every time encoded emits the code, it also creates substing of len + 1
+    // including the first item of the just emmited substring. Do the same here.
+    uptr len = code_to_len(prev_code);
+    code_to_substr.push_back({start - len, start + 1});
+
+    prev_code = code;
+  }
+  return out;
+}
+
+}  // namespace __sanitizer
+#endif

diff  --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
index 0ca8f7f353a6..4602ee9398f5 100644
--- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
@@ -27,6 +27,7 @@ set(SANITIZER_UNITTESTS
   sanitizer_libc_test.cpp
   sanitizer_linux_test.cpp
   sanitizer_list_test.cpp
+  sanitizer_lzw_test.cpp
   sanitizer_mac_test.cpp
   sanitizer_mutex_test.cpp
   sanitizer_nolibc_test.cpp

diff  --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp
new file mode 100644
index 000000000000..4899a56cdecc
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp
@@ -0,0 +1,89 @@
+//===-- sanitizer_lzw_test.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_lzw.h"
+
+#include <iterator>
+
+#include "gtest/gtest.h"
+#include "sanitizer_hash.h"
+
+namespace __sanitizer {
+
+template <typename T>
+struct LzwTest : public ::testing::Test {
+  template <typename Generator>
+  void Run(size_t n, Generator gen) {
+    std::vector<T> data(n);
+    std::generate(data.begin(), data.end(), gen);
+
+    std::vector<u64> lzw;
+    LzwEncode<T>(data.begin(), data.end(), std::back_inserter(lzw));
+
+    std::vector<T> unlzw(data.size() * 2);
+    auto unlzw_end = LzwDecode<T>(lzw.begin(), lzw.end(), unlzw.data());
+    unlzw.resize(unlzw_end - unlzw.data());
+
+    EXPECT_EQ(data, unlzw);
+  }
+};
+
+static constexpr size_t kSizes[] = {0, 1, 2, 7, 13, 32, 129, 10000};
+
+using LzwTestTypes = ::testing::Types<u8, u16, u32, u64>;
+TYPED_TEST_SUITE(LzwTest, LzwTestTypes, );
+
+TYPED_TEST(LzwTest, Same) {
+  MurMur2Hash64Builder h(0);
+  for (size_t sz : kSizes) {
+    u64 v = 0;
+    for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) {
+      this->Run(sz, [&] { return v; });
+      h.add(i);
+      v = h.get();
+    }
+  }
+}
+
+TYPED_TEST(LzwTest, Increment) {
+  MurMur2Hash64Builder h(0);
+  for (size_t sz : kSizes) {
+    u64 v = 0;
+    for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) {
+      this->Run(sz, [&v] { return v++; });
+      h.add(i);
+      v = h.get();
+    }
+  }
+}
+
+TYPED_TEST(LzwTest, IncrementMod) {
+  MurMur2Hash64Builder h(0);
+  for (size_t sz : kSizes) {
+    u64 v = 0;
+    for (size_t i = 1; i < 16 && !this->HasFailure(); ++i) {
+      this->Run(sz, [&] { return v++ % i; });
+      h.add(i);
+      v = h.get();
+    }
+  }
+}
+
+TYPED_TEST(LzwTest, RandomLimited) {
+  for (size_t sz : kSizes) {
+    for (size_t i = 1; i < 1000 && !this->HasFailure(); i *= 2) {
+      u64 v = 0;
+      this->Run(sz, [&] {
+        MurMur2Hash64Builder h(v % i /* Keep unique set limited */);
+        v = h.get();
+        return v;
+      });
+    }
+  }
+}
+
+}  // namespace __sanitizer