[llvm] [Dataset] Upstream llvm-ir-dataset-utils (PR #72320)

Aiden Grossman via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 18 16:30:53 PST 2023


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72320

From 388b1e2e8a31eabab8b8d7ea3481068b8a9a9a66 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Tue, 14 Nov 2023 14:28:50 -0800
Subject: [PATCH] [Dataset] Upstream llvm-ir-dataset-utils

---
 llvm-ir-dataset-utils/.gitignore              |    7 +
 llvm-ir-dataset-utils/.packaging/Dockerfile   |  262 +++
 llvm-ir-dataset-utils/.packaging/README.md    |   49 +
 llvm-ir-dataset-utils/.style.yapf             |    3 +
 llvm-ir-dataset-utils/Pipfile                 |   15 +
 llvm-ir-dataset-utils/Pipfile.lock            | 1660 +++++++++++++++++
 llvm-ir-dataset-utils/README.md               |   70 +
 .../corpus_descriptions/chromium.json         |   25 +
 .../corpus_descriptions/firefox.json          |   22 +
 .../corpus_descriptions/linux.json            |   16 +
 .../corpus_descriptions/llvm.json             |   17 +
 .../autoconf_cpython.json                     |   14 +
 .../autoconf_ffmpeg.json                      |   16 +
 .../cargo_azure_sdk_for_rust.json             |   12 +
 .../cargo_fall_back_to_tar.json               |   16 +
 .../corpus_descriptions_test/cargo_syn.json   |   12 +
 .../cargo_tar_archive.json                    |   11 +
 .../cargo_timeout.json                        |   12 +
 .../cmake_cpuinfo.json                        |   13 +
 .../cmake_googletest.json                     |   12 +
 .../corpus_descriptions_test/julia_gen.json   |    7 +
 .../julia_zomato.json                         |   13 +
 .../manual_bc_corpus.json                     |   17 +
 .../manual_no_license.json                    |   14 +
 .../corpus_descriptions_test/manual_tree.json |   15 +
 .../sources_fallback.json                     |   19 +
 .../sources_tar_archive.json                  |   14 +
 .../corpus_descriptions_test/spack_gmake.json |    9 +
 .../corpus_descriptions_test/spack_zlib.json  |    9 +
 .../spack_zlib_cray.json                      |    9 +
 .../swift_swift_blocks.json                   |   13 +
 .../docs/building-corpora.md                  |   18 +
 .../llvm_ir_dataset_utils/__init__.py         |    0
 .../builders/__init__.py                      |    0
 .../builders/autoconf_builder.py              |   68 +
 .../llvm_ir_dataset_utils/builders/builder.py |  231 +++
 .../builders/cargo_builder.py                 |  177 ++
 .../builders/cmake_builder.py                 |   76 +
 .../builders/julia_builder.jl                 |   27 +
 .../builders/julia_builder.py                 |  118 ++
 .../builders/manual_builder.py                |   58 +
 .../builders/spack_builder.py                 |  226 +++
 .../builders/swift_builder.py                 |   56 +
 .../sources/git_source.py                     |   50 +
 .../llvm_ir_dataset_utils/sources/source.py   |   23 +
 .../sources/tar_source.py                     |   35 +
 .../llvm_ir_dataset_utils/tools/__init__.py   |    0
 .../tools/aggregate_build_sizes.py            |   57 +
 .../tools/audit_licenses.py                   |  100 +
 .../tools/audit_package_list_licenses.py      |   50 +
 .../tools/build_crate_from_repository.py      |  107 ++
 .../tools/build_julia_packages.py             |   71 +
 .../tools/build_spack_package_from_list.py    |  110 ++
 .../tools/build_swift_packages.py             |   67 +
 .../tools/collect_textual_ir.py               |   58 +
 .../tools/corpus_from_description.py          |   62 +
 .../tools/count_tokens.py                     |   35 +
 .../tools/delete_folder.py                    |   43 +
 .../tools/export_deduplicated_corpus.py       |  163 ++
 .../tools/extract_build_failure_logs.py       |   54 +
 .../tools/get_build_failure_logs.py           |   78 +
 .../tools/get_common_constants.py             |  120 ++
 .../tools/get_julia_packages.py               |  112 ++
 .../tools/get_spack_package_list.py           |  152 ++
 .../tools/get_swift_packages.py               |  116 ++
 .../llvm_ir_dataset_utils/tools/link_files.py |   89 +
 .../tools/module_statistics.py                |  174 ++
 .../tools/parse_crates_database.py            |  125 ++
 .../tools/process_to_parquet.py               |   67 +
 .../tools/search_strings.py                   |   74 +
 .../tools/spack_analyze_failures.py           |   87 +
 .../tools/top_x_constants.py                  |   37 +
 .../util/bitcode_module.py                    |  705 +++++++
 .../util/dataset_corpus.py                    |   59 +
 .../llvm_ir_dataset_utils/util/file.py        |   16 +
 .../llvm_ir_dataset_utils/util/github_api.py  |   16 +
 .../llvm_ir_dataset_utils/util/licenses.py    |  173 ++
 .../llvm_ir_dataset_utils/util/parallel.py    |   17 +
 .../util/pass_list_constants.py               |   66 +
 .../llvm_ir_dataset_utils/util/spack.py       |   74 +
 .../visualization_tools/bitcode_histograms.py |  154 ++
 .../dimensionality_reduction.py               |  138 ++
 .../duplication_heatmap.py                    |  148 ++
 .../function_call_histogram.py                |   99 +
 .../generate_histograms.py                    |  121 ++
 .../generate_pass_frequency_chart.py          |  104 ++
 .../visualization_tools/size_treemap.py       |   99 +
 llvm-ir-dataset-utils/pyproject.toml          |   45 +
 88 files changed, 7808 insertions(+)
 create mode 100644 llvm-ir-dataset-utils/.gitignore
 create mode 100644 llvm-ir-dataset-utils/.packaging/Dockerfile
 create mode 100644 llvm-ir-dataset-utils/.packaging/README.md
 create mode 100644 llvm-ir-dataset-utils/.style.yapf
 create mode 100644 llvm-ir-dataset-utils/Pipfile
 create mode 100644 llvm-ir-dataset-utils/Pipfile.lock
 create mode 100644 llvm-ir-dataset-utils/README.md
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions/chromium.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions/firefox.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions/linux.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions/llvm.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_cpython.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_ffmpeg.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cargo_azure_sdk_for_rust.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cargo_fall_back_to_tar.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cargo_syn.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cargo_tar_archive.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cargo_timeout.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cmake_cpuinfo.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/cmake_googletest.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/julia_gen.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/julia_zomato.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/manual_bc_corpus.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/manual_no_license.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/manual_tree.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/sources_fallback.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/sources_tar_archive.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/spack_gmake.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib_cray.json
 create mode 100644 llvm-ir-dataset-utils/corpus_descriptions_test/swift_swift_blocks.json
 create mode 100644 llvm-ir-dataset-utils/docs/building-corpora.md
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/__init__.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/__init__.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/autoconf_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cargo_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cmake_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.jl
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/manual_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/spack_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/swift_builder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/git_source.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/source.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/tar_source.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/__init__.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/aggregate_build_sizes.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_licenses.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_package_list_licenses.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_crate_from_repository.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_julia_packages.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_spack_package_from_list.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_swift_packages.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/collect_textual_ir.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/corpus_from_description.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/count_tokens.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/delete_folder.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/export_deduplicated_corpus.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/extract_build_failure_logs.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_build_failure_logs.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_common_constants.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_julia_packages.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_spack_package_list.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_swift_packages.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/link_files.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/module_statistics.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/parse_crates_database.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/process_to_parquet.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/search_strings.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/spack_analyze_failures.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/top_x_constants.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/bitcode_module.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/dataset_corpus.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/file.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/github_api.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/licenses.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/parallel.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/pass_list_constants.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/spack.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/bitcode_histograms.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/dimensionality_reduction.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/duplication_heatmap.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/function_call_histogram.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_histograms.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_pass_frequency_chart.py
 create mode 100644 llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/size_treemap.py
 create mode 100644 llvm-ir-dataset-utils/pyproject.toml

diff --git a/llvm-ir-dataset-utils/.gitignore b/llvm-ir-dataset-utils/.gitignore
new file mode 100644
index 000000000000000..4828cef57ce77a5
--- /dev/null
+++ b/llvm-ir-dataset-utils/.gitignore
@@ -0,0 +1,7 @@
+__pycache__
+llvm_ir_dataset_utils.egg-info/
+*.crt
+*.tar.gz
+*.tar
+*.sif
+*.swp
diff --git a/llvm-ir-dataset-utils/.packaging/Dockerfile b/llvm-ir-dataset-utils/.packaging/Dockerfile
new file mode 100644
index 000000000000000..9c62f1766089996
--- /dev/null
+++ b/llvm-ir-dataset-utils/.packaging/Dockerfile
@@ -0,0 +1,262 @@
+ARG UBUNTU_VERSION=22.04
+ARG SWIFT_MAJOR_VERSION=8
+ARG SWIFT_MINOR_VERSION=1
+ARG CUSTOM_CERT
+ARG ENABLE_LEGACY_RENEGOTIATION
+
+FROM ubuntu:$UBUNTU_VERSION
+
+ARG SWIFT_MAJOR_VERSION
+ARG SWIFT_MINOR_VERSION
+ARG CUSTOM_CERT
+ARG ENABLE_LEGACY_RENEGOTIATION
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install the base dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+  python3 \
+  python3-dev \
+  python-is-python3 \
+  wget \
+  curl \
+  lsb-release \
+  ca-certificates \
+  software-properties-common \
+  build-essential \
+  gnupg2 \
+  python3-pip \
+  git \
+  pkg-config \
+  libssl-dev \
+  gcc \
+  gfortran \
+  vim \
+  libarchive-dev \
+  libudev-dev \
+  libasound2-dev \
+  libzmq3-dev \
+  cmake \
+  ninja-build \
+  flex \
+  bison \
+  libelf-dev \
+  bc \
+  cpio \
+  htop \
+  jq \
+  file \
+  unzip \
+  binutils \
+  bison \
+  bzip2 \
+  cdbs \
+  curl \
+  dbus-x11 \
+  devscripts \
+  dpkg-dev \
+  elfutils \
+  fakeroot \
+  flex \
+  git-core \
+  gperf \
+  libasound2-dev \
+  libatspi2.0-dev \
+  libbrlapi-dev \
+  libbz2-dev \
+  libc6-dev \
+  libcairo2-dev \
+  libcap-dev \
+  libcups2-dev \
+  libcurl4-gnutls-dev \
+  libdrm-dev \
+  libelf-dev \
+  libevdev-dev \
+  libffi-dev \
+  libfuse2 \
+  libgbm-dev \
+  libglib2.0-dev \
+  libglu1-mesa-dev \
+  libgtk-3-dev \
+  libkrb5-dev \
+  libnspr4-dev \
+  libnss3-dev \
+  libpam0g-dev \
+  libpci-dev \
+  libpulse-dev \
+  libsctp-dev \
+  libspeechd-dev \
+  libsqlite3-dev \
+  libssl-dev \
+  libsystemd-dev \
+  libudev-dev \
+  libva-dev \
+  libwww-perl \
+  libxshmfence-dev \
+  libxslt1-dev \
+  libxss-dev \
+  libxt-dev \
+  libxtst-dev \
+  lighttpd \
+  locales \
+  openbox \
+  p7zip \
+  patch \
+  perl \
+  pkg-config \
+  rpm \
+  ruby \
+  subversion \
+  uuid-dev \
+  wdiff \
+  x11-utils \
+  xcompmgr \
+  xz-utils \
+  zip \
+  libbluetooth-dev \
+  libxkbcommon-dev \
+  mesa-common-dev \
+  zstd \
+  libjpeg-dev \
+  libudev1 \
+  libbrlapi0.8 \
+  libvulkan-dev \
+  libinput-dev \
+  binutils-arm-linux-gnueabihf \
+  binutils-aarch64-linux-gnu \
+  binutils-mipsel-linux-gnu \
+  binutils-mips64el-linux-gnuabi64 \
+  libc6-i386 \
+  lib32stdc++6 \
+  lib32gcc-s1 \
+  lib32z1 \
+  libasound2 \
+  libatk1.0-0 \
+  libatspi2.0-0 \
+  libc6 \
+  libcairo2 \
+  libcap2 \
+  libcgi-session-perl \
+  libcups2 \
+  libdrm2 \
+  libegl1 \
+  libevdev2 \
+  libexpat1 \
+  libfontconfig1 \
+  libfreetype6 \
+  libgbm1 \
+  libglib2.0-0 \
+  libgl1 \
+  libgtk-3-0 \
+  libncurses5 \
+  libpam0g \
+  libpango-1.0-0 \
+  libpangocairo-1.0-0 \
+  libpci3 \
+  libpcre3 \
+  libpixman-1-0 \
+  libspeechd2 \
+  libstdc++6 \
+  libsqlite3-0 \
+  libuuid1 \
+  libwayland-egl1 \
+  libwayland-egl1-mesa \
+  libx11-6 \
+  libx11-xcb1 \
+  libxau6 \
+  libxcb1 \
+  libxcomposite1 \
+  libxcursor1 \
+  libxdamage1 \
+  libxdmcp6 \
+  libxext6 \
+  libxfixes3 \
+  libxi6 \
+  libxinerama1 \
+  libxrandr2 \
+  libxrender1 \
+  libxtst6 \
+  x11-utils \
+  xvfb \
+  zlib1g \
+  libpulse0 \
+  libbz2-1.0 \
+  libffi8 \
+  libpng16-16 \
+  libnspr4 \
+  libnss3 \
+  libvulkan1 \
+  libinput10 \
+  cups \
+  xcb \
+  libxcb-xkb-dev \
+  x11-xkb-utils \
+  libx11-xcb-dev \
+  libxkbcommon-x11-dev \
+  generate-ninja \
+  cbindgen \
+  nasm \
+  libdbus-glib-1-dev
+
+# Set up custom certificate/SSL settings depending upon the build arguments.
+# Include README.md here so that the build doesn't fail if there is no custom
+# certificate specified. Then we just delete it afterwards.
+COPY README.md $CUSTOM_CERT /usr/local/share/ca-certificates/
+RUN rm /usr/local/share/ca-certificates/README.md \
+  && update-ca-certificates
+RUN if [ -n "$ENABLE_LEGACY_RENEGOTIATION" ]; then echo "Options = UnsafeLegacyRenegotiation" >> /etc/ssl/openssl.cnf ; fi
+
+# Can this be converted into a native Ubuntu install as in the LLVM case?
+ENV CARGO_HOME="/cargo"
+ENV RUSTUP_HOME="/rustup"
+RUN curl https://sh.rustup.rs | sh -s -- -y --default-toolchain none
+ENV PATH="$PATH:/cargo/bin"
+
+# LLVM Installation
+RUN git clone -b running-fixes --depth=1 https://github.com/llvm-ml/llvm-project \
+    && mkdir /llvm-project/build \
+    && cd /llvm-project/build \
+    && cmake -GNinja -DCMAKE_BUILD_TYPE=Release \
+         -DLLVM_ENABLE_PROJECTS="clang;lld" \
+         -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" \
+         -DCMAKE_INSTALL_PREFIX=/usr \
+         -DLLVM_TARGETS_TO_BUILD=Native \
+         -DLLVM_ENABLE_ASSERTIONS=ON \
+         ../llvm \
+    && ninja install \
+    && cd / \
+    && rm -rf /llvm-project
+
+# Install Julia
+RUN git clone -b emit-per-package-bitcode --depth=1 https://github.com/llvm-ml/julia /julia \
+    && cd /julia \
+    && make MARCH=x86-64 -j $(nproc) \
+    && echo prefix=/usr > Make.user \
+    && make MARCH=x86-64 install \
+    && cd / \
+    && rm -rf /julia
+
+# Install Swift
+RUN curl \
+    https://download.swift.org/swift-5.8.1-release/ubuntu2204/swift-5.8.1-RELEASE/swift-5.8.1-RELEASE-ubuntu22.04.tar.gz \
+    | tar -xz
+RUN mv swift-5.$SWIFT_MAJOR_VERSION.$SWIFT_MINOR_VERSION-RELEASE-ubuntu22.04/usr/ /opt/swift-5.$SWIFT_MAJOR_VERSION.$SWIFT_MINOR_VERSION/
+ENV PATH="${PATH}:/opt/swift-5.$SWIFT_MAJOR_VERSION.$SWIFT_MINOR_VERSION/bin/"
+
+# Set up the Python dependencies
+COPY Pipfile* ./
+RUN pip3 install pipenv \
+ && pipenv sync --categories "packages dev-packages" --system \
+ && pipenv --clear \
+ && rm Pipfile*
+
+# Install current node
+RUN curl -sL https://deb.nodesource.com/setup_20.x | bash
+RUN apt-get install -y nodejs
+
+# Clean up the Docker container to make the image smaller
+RUN apt-get autoremove -y --purge \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV DEBIAN_FRONTEND=
diff --git a/llvm-ir-dataset-utils/.packaging/README.md b/llvm-ir-dataset-utils/.packaging/README.md
new file mode 100644
index 000000000000000..04f2ce251112bb3
--- /dev/null
+++ b/llvm-ir-dataset-utils/.packaging/README.md
@@ -0,0 +1,49 @@
+# Packaging
+
+This directory contains utilities to package the project along with relevant
+dependencies and toolchains to build the dataset.
+
+### Building the Docker Image
+
+To build the Docker image, run the following command from the root of the
+repository:
+
+```bash
+docker build -t llvm-ir-dataset-utils -f ./.packaging/Dockerfile .
+```
+
+To get the image building on machines that sit behind an older firewall or use
+custom SSL certificates, you can pass the following two build arguments to
+`docker build` to make the image build work in your environment:
+
+* `CUSTOM_CERT` - Pass the path to a `*.crt` file in the build context to make
+the container use that certificate. Note that the file extension must be `*.crt`
+rather than `*.pem` (or anything else) because of how Ubuntu's
+`update-ca-certificates` detects new certificates.
+* `ENABLE_LEGACY_RENEGOTIATION` - Enables legacy TLS renegotiation, which is
+required on some systems that have a firewall in place when accessing certain
+hosts.
+
+As an example, to build the container image in an environment that requires
+legacy SSL renegotiation and a custom certificate, you can run the following commands:
+
+1. Start by making sure your current working directory is the root of the
+project:
+```bash
+cd /path/to/llvm-ir-dataset-utils
+```
+2. Copy over the certificate (bundle) that you want the container to use:
+```bash
+cp /path/to/certificate.crt ./additional_cert.crt
+```
+3. Build the container image, making sure to specify the appropriate build
+flags:
+```bash
+docker build \
+  -t llvm-ir-dataset-utils \
+  -f ./.packaging/Dockerfile \
+  --build-arg="CUSTOM_CERT=./additional_cert.crt" \
+  --build-arg="ENABLE_LEGACY_RENEGOTIATION=ON" .
+```
+
+Then you should end up with the desired container image.
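+
+To quickly sanity-check the result, you can open an interactive shell inside
+the freshly built image (a minimal sketch; it assumes the image was tagged
+`llvm-ir-dataset-utils` as in the commands above and uses the base image's
+default `bash` shell):
+
+```bash
+docker run -it --rm llvm-ir-dataset-utils bash
+```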
diff --git a/llvm-ir-dataset-utils/.style.yapf b/llvm-ir-dataset-utils/.style.yapf
new file mode 100644
index 000000000000000..64ab7f0ee656f18
--- /dev/null
+++ b/llvm-ir-dataset-utils/.style.yapf
@@ -0,0 +1,3 @@
+[style]
+based_on_style = yapf
+indent_width = 2
diff --git a/llvm-ir-dataset-utils/Pipfile b/llvm-ir-dataset-utils/Pipfile
new file mode 100644
index 000000000000000..7f10ca5991bbb6c
--- /dev/null
+++ b/llvm-ir-dataset-utils/Pipfile
@@ -0,0 +1,15 @@
+[packages]
+absl-py = ">=1.4.0" # Apache-2.0
+ml-compiler-opt = ">=0.0.1.dev202308100007" # Apache-2.0
+ray = ">=2.5.1" # Apache-2.0
+toml = ">=0.10.2" # MIT
+pandas = ">=2.0.3" # BSD-3-Clause
+plotly = ">=5.16.1" # MIT
+# Kaleido is a dependency of plotly that is not explicitly declared but is
+# needed for image export.
+kaleido = ">=0.2.1" # MIT
+umap-learn = ">=0.5.3" # BSD-3-Clause
+
+[dev-packages]
+yapf = ">=0.33.0" # Apache-2.0
+pytest = ">=7.4.0" # MIT
diff --git a/llvm-ir-dataset-utils/Pipfile.lock b/llvm-ir-dataset-utils/Pipfile.lock
new file mode 100644
index 000000000000000..2fa1afad4730e63
--- /dev/null
+++ b/llvm-ir-dataset-utils/Pipfile.lock
@@ -0,0 +1,1660 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "c64b39554cf5cb7ea6f048c599aa5f486412531f0eb95896e7f2675ce9003035"
+        },
+        "pipfile-spec": 6,
+        "requires": {},
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "absl-py": {
+            "hashes": [
+                "sha256:0d3fe606adfa4f7db64792dd4c7aee4ee0c38ab75dfd353b7a83ed3e957fcb47",
+                "sha256:d2c244d01048ba476e7c080bd2c6df5e141d211de80223460d5b3b8a2a58433d"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.6'",
+            "version": "==1.4.0"
+        },
+        "aiosignal": {
+            "hashes": [
+                "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc",
+                "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.3.1"
+        },
+        "astunparse": {
+            "hashes": [
+                "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872",
+                "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"
+            ],
+            "version": "==1.6.3"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04",
+                "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==23.1.0"
+        },
+        "cachetools": {
+            "hashes": [
+                "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590",
+                "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==5.3.1"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082",
+                "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2023.7.22"
+        },
+        "charset-normalizer": {
+            "hashes": [
+                "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96",
+                "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c",
+                "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710",
+                "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706",
+                "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020",
+                "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252",
+                "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad",
+                "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329",
+                "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a",
+                "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f",
+                "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6",
+                "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4",
+                "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a",
+                "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46",
+                "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2",
+                "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23",
+                "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace",
+                "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd",
+                "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982",
+                "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10",
+                "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2",
+                "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea",
+                "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09",
+                "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5",
+                "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149",
+                "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489",
+                "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9",
+                "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80",
+                "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592",
+                "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3",
+                "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6",
+                "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed",
+                "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c",
+                "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200",
+                "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a",
+                "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e",
+                "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d",
+                "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6",
+                "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623",
+                "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669",
+                "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3",
+                "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa",
+                "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9",
+                "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2",
+                "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f",
+                "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1",
+                "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4",
+                "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a",
+                "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8",
+                "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3",
+                "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029",
+                "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f",
+                "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959",
+                "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22",
+                "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7",
+                "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952",
+                "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346",
+                "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e",
+                "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d",
+                "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299",
+                "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd",
+                "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a",
+                "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3",
+                "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037",
+                "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94",
+                "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c",
+                "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858",
+                "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a",
+                "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449",
+                "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c",
+                "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918",
+                "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1",
+                "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c",
+                "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac",
+                "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"
+            ],
+            "markers": "python_full_version >= '3.7.0'",
+            "version": "==3.2.0"
+        },
+        "click": {
+            "hashes": [
+                "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28",
+                "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==8.1.7"
+        },
+        "cloudpickle": {
+            "hashes": [
+                "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f",
+                "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.2.1"
+        },
+        "decorator": {
+            "hashes": [
+                "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+                "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
+            ],
+            "markers": "python_version >= '3.5'",
+            "version": "==5.1.1"
+        },
+        "dm-reverb": {
+            "hashes": [
+                "sha256:2cad7369b6bff5329cb1dd5de460644d9ffc01ba4481ea18448cf0a19eb11ce2",
+                "sha256:85570bebeed78c2ff70155f9e553cbb09d02c8a5981ba62871b7fe02cf238a13",
+                "sha256:da261f59253a4764df5454db99a2a2ef9af897bb20b7ef5f79756b096c951eab",
+                "sha256:e3af7b93984ffc75e70e5ab38bbb6f6c41c5b4070461418040b1e151f52bfe7b"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==0.12.0"
+        },
+        "dm-tree": {
+            "hashes": [
+                "sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6",
+                "sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c",
+                "sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf",
+                "sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430",
+                "sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de",
+                "sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317",
+                "sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca",
+                "sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913",
+                "sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf",
+                "sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef",
+                "sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426",
+                "sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1",
+                "sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e",
+                "sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60",
+                "sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5",
+                "sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f",
+                "sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410",
+                "sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134",
+                "sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b",
+                "sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7",
+                "sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393",
+                "sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571",
+                "sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368",
+                "sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80",
+                "sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7",
+                "sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d",
+                "sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a",
+                "sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d",
+                "sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6",
+                "sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5",
+                "sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68",
+                "sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8",
+                "sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f",
+                "sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436",
+                "sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee",
+                "sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb",
+                "sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144",
+                "sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb",
+                "sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d"
+            ],
+            "version": "==0.1.8"
+        },
+        "filelock": {
+            "hashes": [
+                "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81",
+                "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==3.12.2"
+        },
+        "flatbuffers": {
+            "hashes": [
+                "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89",
+                "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"
+            ],
+            "version": "==23.5.26"
+        },
+        "frozenlist": {
+            "hashes": [
+                "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6",
+                "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01",
+                "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251",
+                "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9",
+                "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b",
+                "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87",
+                "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf",
+                "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f",
+                "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0",
+                "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2",
+                "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b",
+                "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc",
+                "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c",
+                "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467",
+                "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9",
+                "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1",
+                "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a",
+                "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79",
+                "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167",
+                "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300",
+                "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf",
+                "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea",
+                "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2",
+                "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab",
+                "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3",
+                "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb",
+                "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087",
+                "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc",
+                "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8",
+                "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62",
+                "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f",
+                "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326",
+                "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c",
+                "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431",
+                "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963",
+                "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7",
+                "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef",
+                "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3",
+                "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956",
+                "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781",
+                "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472",
+                "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc",
+                "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839",
+                "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672",
+                "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3",
+                "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503",
+                "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d",
+                "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8",
+                "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b",
+                "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc",
+                "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f",
+                "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559",
+                "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b",
+                "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95",
+                "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb",
+                "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963",
+                "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919",
+                "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f",
+                "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3",
+                "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1",
+                "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==1.4.0"
+        },
+        "gast": {
+            "hashes": [
+                "sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1",
+                "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.4.0"
+        },
+        "gin-config": {
+            "hashes": [
+                "sha256:0c6ea5026ded927c8c93c990b01c695257c1df446e45e549a158cfbc79e19ed6",
+                "sha256:bddb7ca221ea2b46cdb59321e79fecf02d6e3b728906047fcd4076c297609fd6"
+            ],
+            "version": "==0.5.0"
+        },
+        "google-auth": {
+            "hashes": [
+                "sha256:164cba9af4e6e4e40c3a4f90a1a6c12ee56f14c0b4868d1ca91b32826ab334ce",
+                "sha256:d61d1b40897407b574da67da1a833bdc10d5a11642566e506565d1b1a46ba873"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.22.0"
+        },
+        "google-auth-oauthlib": {
+            "hashes": [
+                "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb",
+                "sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==1.0.0"
+        },
+        "google-pasta": {
+            "hashes": [
+                "sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954",
+                "sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed",
+                "sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e"
+            ],
+            "version": "==0.2.0"
+        },
+        "grpcio": {
+            "hashes": [
+                "sha256:00258cbe3f5188629828363ae8ff78477ce976a6f63fb2bb5e90088396faa82e",
+                "sha256:092fa155b945015754bdf988be47793c377b52b88d546e45c6a9f9579ac7f7b6",
+                "sha256:0f80bf37f09e1caba6a8063e56e2b87fa335add314cf2b78ebf7cb45aa7e3d06",
+                "sha256:20ec6fc4ad47d1b6e12deec5045ec3cd5402d9a1597f738263e98f490fe07056",
+                "sha256:2313b124e475aa9017a9844bdc5eafb2d5abdda9d456af16fc4535408c7d6da6",
+                "sha256:23e7d8849a0e58b806253fd206ac105b328171e01b8f18c7d5922274958cc87e",
+                "sha256:2f708a6a17868ad8bf586598bee69abded4996b18adf26fd2d91191383b79019",
+                "sha256:2f7349786da979a94690cc5c2b804cab4e8774a3cf59be40d037c4342c906649",
+                "sha256:34950353539e7d93f61c6796a007c705d663f3be41166358e3d88c45760c7d98",
+                "sha256:40b72effd4c789de94ce1be2b5f88d7b9b5f7379fe9645f198854112a6567d9a",
+                "sha256:4b089f7ad1eb00a104078bab8015b0ed0ebcb3b589e527ab009c53893fd4e613",
+                "sha256:4faea2cfdf762a664ab90589b66f416274887641ae17817de510b8178356bf73",
+                "sha256:5371bcd861e679d63b8274f73ac281751d34bd54eccdbfcd6aa00e692a82cd7b",
+                "sha256:5613a2fecc82f95d6c51d15b9a72705553aa0d7c932fad7aed7afb51dc982ee5",
+                "sha256:57b183e8b252825c4dd29114d6c13559be95387aafc10a7be645462a0fc98bbb",
+                "sha256:5b7a4ce8f862fe32b2a10b57752cf3169f5fe2915acfe7e6a1e155db3da99e79",
+                "sha256:5e5b58e32ae14658085c16986d11e99abd002ddbf51c8daae8a0671fffb3467f",
+                "sha256:60fe15288a0a65d5c1cb5b4a62b1850d07336e3ba728257a810317be14f0c527",
+                "sha256:6907b1cf8bb29b058081d2aad677b15757a44ef2d4d8d9130271d2ad5e33efca",
+                "sha256:76c44efa4ede1f42a9d5b2fed1fe9377e73a109bef8675fb0728eb80b0b8e8f2",
+                "sha256:7a635589201b18510ff988161b7b573f50c6a48fae9cb567657920ca82022b37",
+                "sha256:7b400807fa749a9eb286e2cd893e501b110b4d356a218426cb9c825a0474ca56",
+                "sha256:82640e57fb86ea1d71ea9ab54f7e942502cf98a429a200b2e743d8672171734f",
+                "sha256:871f9999e0211f9551f368612460442a5436d9444606184652117d6a688c9f51",
+                "sha256:9338bacf172e942e62e5889b6364e56657fbf8ac68062e8b25c48843e7b202bb",
+                "sha256:a8a8e560e8dbbdf29288872e91efd22af71e88b0e5736b0daf7773c1fecd99f0",
+                "sha256:aed90d93b731929e742967e236f842a4a2174dc5db077c8f9ad2c5996f89f63e",
+                "sha256:b363bbb5253e5f9c23d8a0a034dfdf1b7c9e7f12e602fc788c435171e96daccc",
+                "sha256:b4098b6b638d9e0ca839a81656a2fd4bc26c9486ea707e8b1437d6f9d61c3941",
+                "sha256:b53333627283e7241fcc217323f225c37783b5f0472316edcaa4479a213abfa6",
+                "sha256:b670c2faa92124b7397b42303e4d8eb64a4cd0b7a77e35a9e865a55d61c57ef9",
+                "sha256:bb396952cfa7ad2f01061fbc7dc1ad91dd9d69243bcb8110cf4e36924785a0fe",
+                "sha256:c60b83c43faeb6d0a9831f0351d7787a0753f5087cc6fa218d78fdf38e5acef0",
+                "sha256:c6ebecfb7a31385393203eb04ed8b6a08f5002f53df3d59e5e795edb80999652",
+                "sha256:d78d8b86fcdfa1e4c21f8896614b6cc7ee01a2a758ec0c4382d662f2a62cf766",
+                "sha256:d7f8df114d6b4cf5a916b98389aeaf1e3132035420a88beea4e3d977e5f267a5",
+                "sha256:e1cb52fa2d67d7f7fab310b600f22ce1ff04d562d46e9e0ac3e3403c2bb4cc16",
+                "sha256:e3fdf04e402f12e1de8074458549337febb3b45f21076cc02ef4ff786aff687e",
+                "sha256:e503cb45ed12b924b5b988ba9576dc9949b2f5283b8e33b21dcb6be74a7c58d0",
+                "sha256:f19ac6ac0a256cf77d3cc926ef0b4e64a9725cc612f97228cd5dc4bd9dbab03b",
+                "sha256:f1fb0fd4a1e9b11ac21c30c169d169ef434c6e9344ee0ab27cfa6f605f6387b2",
+                "sha256:fada6b07ec4f0befe05218181f4b85176f11d531911b64c715d1875c4736d73a",
+                "sha256:fd173b4cf02b20f60860dc2ffe30115c18972d7d6d2d69df97ac38dee03be5bf",
+                "sha256:fe752639919aad9ffb0dee0d87f29a6467d1ef764f13c4644d212a9a853a078d",
+                "sha256:fee387d2fab144e8a34e0e9c5ca0f45c9376b99de45628265cfa9886b1dbe62b"
+            ],
+            "markers": "python_version >= '3.10'",
+            "version": "==1.57.0"
+        },
+        "gym": {
+            "hashes": [
+                "sha256:dbd3d0c50fc1260b57e6f12ba792152b73551730512623b7653d6dfb2f7a105d"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.23.0"
+        },
+        "gym-notices": {
+            "hashes": [
+                "sha256:ad25e200487cafa369728625fe064e88ada1346618526102659b4640f2b4b911",
+                "sha256:e5f82e00823a166747b4c2a07de63b6560b1acb880638547e0cabf825a01e463"
+            ],
+            "version": "==0.0.8"
+        },
+        "h5py": {
+            "hashes": [
+                "sha256:12aa556d540f11a2cae53ea7cfb94017353bd271fb3962e1296b342f6550d1b8",
+                "sha256:23e74b878bbe1653ab34ca49b83cac85529cd0b36b9d625516c5830cc5ca2eac",
+                "sha256:36408f8c62f50007d14e000f9f3acf77e103b9e932c114cbe52a3089e50ebf94",
+                "sha256:3f457089c5d524b7998e3649bc63240679b8fb0a3859ea53bbb06841f3d755f1",
+                "sha256:54f01202cdea754ab4227dd27014bdbd561a4bbe4b631424fd812f7c2ce9c6ac",
+                "sha256:551e358db05a874a0f827b22e95b30092f2303edc4b91bb62ad2f10e0236e1a0",
+                "sha256:64acceaf6aff92af091a4b83f6dee3cf8d3061f924a6bb3a33eb6c4658a8348b",
+                "sha256:6822a814b9d8b8363ff102f76ea8d026f0ca25850bb579d85376029ee3e73b93",
+                "sha256:78e44686334cbbf2dd21d9df15823bc38663f27a3061f6a032c68a3e30c47bf7",
+                "sha256:79bbca34696c6f9eeeb36a91776070c49a060b2879828e2c8fa6c58b8ed10dd1",
+                "sha256:804c7fb42a34c8ab3a3001901c977a5c24d2e9c586a0f3e7c0a389130b4276fc",
+                "sha256:8d9492391ff5c3c80ec30ae2fe82a3f0efd1e750833739c25b0d090e3be1b095",
+                "sha256:95f7a745efd0d56076999b52e8da5fad5d30823bac98b59c68ae75588d09991a",
+                "sha256:9da9e7e63376c32704e37ad4cea2dceae6964cee0d8515185b3ab9cbd6b947bc",
+                "sha256:a4e20897c88759cbcbd38fb45b507adc91af3e0f67722aa302d71f02dd44d286",
+                "sha256:a6284061f3214335e1eec883a6ee497dbe7a79f19e6a57fed2dd1f03acd5a8cb",
+                "sha256:d97409e17915798029e297a84124705c8080da901307ea58f29234e09b073ddc",
+                "sha256:dbf5225543ca35ce9f61c950b73899a82be7ba60d58340e76d0bd42bf659235a",
+                "sha256:e604db6521c1e367c6bd7fad239c847f53cc46646f2d2651372d05ae5e95f817",
+                "sha256:eb7bdd5e601dd1739698af383be03f3dad0465fe67184ebd5afca770f50df9d6",
+                "sha256:f68b41efd110ce9af1cbe6fa8af9f4dcbadace6db972d30828b911949e28fadd"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.9.0"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
+                "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
+            ],
+            "markers": "python_version >= '3.5'",
+            "version": "==3.4"
+        },
+        "joblib": {
+            "hashes": [
+                "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1",
+                "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.3.2"
+        },
+        "jsonschema": {
+            "hashes": [
+                "sha256:043dc26a3845ff09d20e4420d6012a9c91c9aa8999fa184e7efcfeccb41e32cb",
+                "sha256:6e1e7569ac13be8139b2dd2c21a55d350066ee3f80df06c608b398cdc6f30e8f"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==4.19.0"
+        },
+        "jsonschema-specifications": {
+            "hashes": [
+                "sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1",
+                "sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2023.7.1"
+        },
+        "kaleido": {
+            "hashes": [
+                "sha256:4670985f28913c2d063c5734d125ecc28e40810141bdb0a46f15b76c1d45f23c",
+                "sha256:845819844c8082c9469d9c17e42621fbf85c2b237ef8a86ec8a8527f98b6512a",
+                "sha256:aa21cf1bf1c78f8fa50a9f7d45e1003c387bd3d6fe0a767cfbbf344b95bdc3a8",
+                "sha256:bb9a5d1f710357d5d432ee240ef6658a6d124c3e610935817b4b42da9c787c05",
+                "sha256:ca6f73e7ff00aaebf2843f73f1d3bacde1930ef5041093fe76b83a15785049a7",
+                "sha256:ecc72635860be616c6b7161807a65c0dbd9b90c6437ac96965831e2e24066552"
+            ],
+            "index": "pypi",
+            "version": "==0.2.1"
+        },
+        "keras": {
+            "hashes": [
+                "sha256:5ce5f706f779fa7330e63632f327b75ce38144a120376b2ae1917c00fa6136af",
+                "sha256:5df12cc241a015a11b65ddb452c0eeb2744fce21d9b54ba48db87492568ccc68"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2.13.1"
+        },
+        "libclang": {
+            "hashes": [
+                "sha256:1e940048f51d0b0999099a9b78629ab8a64b62af5e9ff1b2b062439c21ee244d",
+                "sha256:4a9acbfd9c135a72f80d5dbff7588dfb0c81458244a89b9e83526e8595880e0a",
+                "sha256:4acdde39dfe410c877b4ccc0d4b57eb952100e4ee26bbdf6cfdb88e2033a7d31",
+                "sha256:8130482120500476a027171f8f3c8dfc2536b591716eea71fc5da22cae13131b",
+                "sha256:88bc7e7b393c32e41e03ba77ef02fdd647da1f764c2cd028e69e0837080b79f6",
+                "sha256:9dcdc730939788b8b69ffd6d5d75fe5366e3ee007f1e36a99799ec0b0c001492",
+                "sha256:d80ed5827736ed5ec2bcedf536720476fd9d4fa4c79ef0cb24aea4c59332f361",
+                "sha256:da9e47ebc3f0a6d90fb169ef25f9fbcd29b4a4ef97a8b0e3e3a17800af1423f4",
+                "sha256:daab4a11dae228f1efa9efa3fe638b493b14d8d52c71fb3c7019e2f1df4514c2",
+                "sha256:e1a5ad1e895e5443e205568c85c04b4608e4e973dae42f4dfd9cb46c81d1486b",
+                "sha256:f04e3060ae1f207f234d0608900c99c50edcb743e5e18276d78da2ddd727d39f"
+            ],
+            "version": "==16.0.6"
+        },
+        "llvmlite": {
+            "hashes": [
+                "sha256:09f83ea7a54509c285f905d968184bba00fc31ebf12f2b6b1494d677bb7dde9b",
+                "sha256:0c23edd196bd797dc3a7860799054ea3488d2824ecabc03f9135110c2e39fcbc",
+                "sha256:3673c53cb21c65d2ff3704962b5958e967c6fc0bd0cff772998face199e8d87b",
+                "sha256:39a0b4d0088c01a469a5860d2e2d7a9b4e6a93c0f07eb26e71a9a872a8cadf8d",
+                "sha256:467b43836b388eaedc5a106d76761e388dbc4674b2f2237bc477c6895b15a634",
+                "sha256:4a7525db121f2e699809b539b5308228854ccab6693ecb01b52c44a2f5647e20",
+                "sha256:5b3076dc4e9c107d16dc15ecb7f2faf94f7736cd2d5e9f4dc06287fd672452c1",
+                "sha256:5cdb0d45df602099d833d50bd9e81353a5e036242d3c003c5b294fc61d1986b4",
+                "sha256:7b37297f3cbd68d14a97223a30620589d98ad1890e5040c9e5fc181063f4ed49",
+                "sha256:84747289775d0874e506f907a4513db889471607db19b04de97d144047fec885",
+                "sha256:84ce9b1c7a59936382ffde7871978cddcda14098e5a76d961e204523e5c372fb",
+                "sha256:9329b930d699699846623054121ed105fd0823ed2180906d3b3235d361645490",
+                "sha256:96707ebad8b051bbb4fc40c65ef93b7eeee16643bd4d579a14d11578e4b7a647",
+                "sha256:a36d9f244b6680cb90bbca66b146dabb2972f4180c64415c96f7c8a2d8b60a36",
+                "sha256:a66a5bd580951751b4268f4c3bddcef92682814d6bc72f3cd3bb67f335dd7097",
+                "sha256:bba2747cf5b4954e945c287fe310b3fcc484e2a9d1b0c273e99eb17d103bb0e6",
+                "sha256:bbd5e82cc990e5a3e343a3bf855c26fdfe3bfae55225f00efd01c05bbda79918",
+                "sha256:cda71de10a1f48416309e408ea83dab5bf36058f83e13b86a2961defed265568",
+                "sha256:e2dbbb8424037ca287983b115a29adf37d806baf7e1bf4a67bd2cffb74e085ed",
+                "sha256:e35766e42acef0fe7d1c43169a8ffc327a47808fae6a067b049fe0e9bbf84dd5",
+                "sha256:e44f854dc11559795bcdeaf12303759e56213d42dabbf91a5897aa2d8b033810",
+                "sha256:e74e7bec3235a1e1c9ad97d897a620c5007d0ed80c32c84c1d787e7daa17e4ec",
+                "sha256:f643d15aacd0b0b0dc8b74b693822ba3f9a53fa63bc6a178c2dba7cc88f42144",
+                "sha256:ff8f31111bb99d135ff296757dc81ab36c2dee54ed4bd429158a96da9807c316"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.40.1"
+        },
+        "markdown": {
+            "hashes": [
+                "sha256:225c6123522495d4119a90b3a3ba31a1e87a70369e03f14799ea9c0d7183a3d6",
+                "sha256:a4c1b65c0957b4bd9e7d86ddc7b3c9868fb9670660f6f99f6d1bca8954d5a941"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==3.4.4"
+        },
+        "markupsafe": {
+            "hashes": [
+                "sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e",
+                "sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e",
+                "sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431",
+                "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686",
+                "sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559",
+                "sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc",
+                "sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c",
+                "sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0",
+                "sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4",
+                "sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9",
+                "sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575",
+                "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba",
+                "sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d",
+                "sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3",
+                "sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00",
+                "sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155",
+                "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac",
+                "sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52",
+                "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f",
+                "sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8",
+                "sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b",
+                "sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24",
+                "sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea",
+                "sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198",
+                "sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0",
+                "sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee",
+                "sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be",
+                "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2",
+                "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707",
+                "sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6",
+                "sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58",
+                "sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779",
+                "sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636",
+                "sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c",
+                "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad",
+                "sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee",
+                "sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc",
+                "sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2",
+                "sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48",
+                "sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7",
+                "sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e",
+                "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b",
+                "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa",
+                "sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5",
+                "sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e",
+                "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb",
+                "sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9",
+                "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57",
+                "sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc",
+                "sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.1.3"
+        },
+        "ml-compiler-opt": {
+            "hashes": [
+                "sha256:61319bdbdd3719f439f5a0f6b233b84cadbbece47209fd50048ac237fc639bc1",
+                "sha256:d7610080293aab202a321e02e0a17b8f481e6b7275a35a68c91cf5bc2a94a6a3"
+            ],
+            "index": "pypi",
+            "markers": "python_version < '3.11' and python_version >= '3.8'",
+            "version": "==0.0.1.dev202308290006"
+        },
+        "msgpack": {
+            "hashes": [
+                "sha256:06f5174b5f8ed0ed919da0e62cbd4ffde676a374aba4020034da05fab67b9164",
+                "sha256:0c05a4a96585525916b109bb85f8cb6511db1c6f5b9d9cbcbc940dc6b4be944b",
+                "sha256:137850656634abddfb88236008339fdaba3178f4751b28f270d2ebe77a563b6c",
+                "sha256:17358523b85973e5f242ad74aa4712b7ee560715562554aa2134d96e7aa4cbbf",
+                "sha256:18334484eafc2b1aa47a6d42427da7fa8f2ab3d60b674120bce7a895a0a85bdd",
+                "sha256:1835c84d65f46900920b3708f5ba829fb19b1096c1800ad60bae8418652a951d",
+                "sha256:1967f6129fc50a43bfe0951c35acbb729be89a55d849fab7686004da85103f1c",
+                "sha256:1ab2f3331cb1b54165976a9d976cb251a83183631c88076613c6c780f0d6e45a",
+                "sha256:1c0f7c47f0087ffda62961d425e4407961a7ffd2aa004c81b9c07d9269512f6e",
+                "sha256:20a97bf595a232c3ee6d57ddaadd5453d174a52594bf9c21d10407e2a2d9b3bd",
+                "sha256:20c784e66b613c7f16f632e7b5e8a1651aa5702463d61394671ba07b2fc9e025",
+                "sha256:266fa4202c0eb94d26822d9bfd7af25d1e2c088927fe8de9033d929dd5ba24c5",
+                "sha256:28592e20bbb1620848256ebc105fc420436af59515793ed27d5c77a217477705",
+                "sha256:288e32b47e67f7b171f86b030e527e302c91bd3f40fd9033483f2cacc37f327a",
+                "sha256:3055b0455e45810820db1f29d900bf39466df96ddca11dfa6d074fa47054376d",
+                "sha256:332360ff25469c346a1c5e47cbe2a725517919892eda5cfaffe6046656f0b7bb",
+                "sha256:362d9655cd369b08fda06b6657a303eb7172d5279997abe094512e919cf74b11",
+                "sha256:366c9a7b9057e1547f4ad51d8facad8b406bab69c7d72c0eb6f529cf76d4b85f",
+                "sha256:36961b0568c36027c76e2ae3ca1132e35123dcec0706c4b7992683cc26c1320c",
+                "sha256:379026812e49258016dd84ad79ac8446922234d498058ae1d415f04b522d5b2d",
+                "sha256:382b2c77589331f2cb80b67cc058c00f225e19827dbc818d700f61513ab47bea",
+                "sha256:476a8fe8fae289fdf273d6d2a6cb6e35b5a58541693e8f9f019bfe990a51e4ba",
+                "sha256:48296af57cdb1d885843afd73c4656be5c76c0c6328db3440c9601a98f303d87",
+                "sha256:4867aa2df9e2a5fa5f76d7d5565d25ec76e84c106b55509e78c1ede0f152659a",
+                "sha256:4c075728a1095efd0634a7dccb06204919a2f67d1893b6aa8e00497258bf926c",
+                "sha256:4f837b93669ce4336e24d08286c38761132bc7ab29782727f8557e1eb21b2080",
+                "sha256:4f8d8b3bf1ff2672567d6b5c725a1b347fe838b912772aa8ae2bf70338d5a198",
+                "sha256:525228efd79bb831cf6830a732e2e80bc1b05436b086d4264814b4b2955b2fa9",
+                "sha256:5494ea30d517a3576749cad32fa27f7585c65f5f38309c88c6d137877fa28a5a",
+                "sha256:55b56a24893105dc52c1253649b60f475f36b3aa0fc66115bffafb624d7cb30b",
+                "sha256:56a62ec00b636583e5cb6ad313bbed36bb7ead5fa3a3e38938503142c72cba4f",
+                "sha256:57e1f3528bd95cc44684beda696f74d3aaa8a5e58c816214b9046512240ef437",
+                "sha256:586d0d636f9a628ddc6a17bfd45aa5b5efaf1606d2b60fa5d87b8986326e933f",
+                "sha256:5cb47c21a8a65b165ce29f2bec852790cbc04936f502966768e4aae9fa763cb7",
+                "sha256:6c4c68d87497f66f96d50142a2b73b97972130d93677ce930718f68828b382e2",
+                "sha256:821c7e677cc6acf0fd3f7ac664c98803827ae6de594a9f99563e48c5a2f27eb0",
+                "sha256:916723458c25dfb77ff07f4c66aed34e47503b2eb3188b3adbec8d8aa6e00f48",
+                "sha256:9e6ca5d5699bcd89ae605c150aee83b5321f2115695e741b99618f4856c50898",
+                "sha256:9f5ae84c5c8a857ec44dc180a8b0cc08238e021f57abdf51a8182e915e6299f0",
+                "sha256:a2b031c2e9b9af485d5e3c4520f4220d74f4d222a5b8dc8c1a3ab9448ca79c57",
+                "sha256:a61215eac016f391129a013c9e46f3ab308db5f5ec9f25811e811f96962599a8",
+                "sha256:a740fa0e4087a734455f0fc3abf5e746004c9da72fbd541e9b113013c8dc3282",
+                "sha256:a9985b214f33311df47e274eb788a5893a761d025e2b92c723ba4c63936b69b1",
+                "sha256:ab31e908d8424d55601ad7075e471b7d0140d4d3dd3272daf39c5c19d936bd82",
+                "sha256:ac9dd47af78cae935901a9a500104e2dea2e253207c924cc95de149606dc43cc",
+                "sha256:addab7e2e1fcc04bd08e4eb631c2a90960c340e40dfc4a5e24d2ff0d5a3b3edb",
+                "sha256:b1d46dfe3832660f53b13b925d4e0fa1432b00f5f7210eb3ad3bb9a13c6204a6",
+                "sha256:b2de4c1c0538dcb7010902a2b97f4e00fc4ddf2c8cda9749af0e594d3b7fa3d7",
+                "sha256:b5ef2f015b95f912c2fcab19c36814963b5463f1fb9049846994b007962743e9",
+                "sha256:b72d0698f86e8d9ddf9442bdedec15b71df3598199ba33322d9711a19f08145c",
+                "sha256:bae7de2026cbfe3782c8b78b0db9cbfc5455e079f1937cb0ab8d133496ac55e1",
+                "sha256:bf22a83f973b50f9d38e55c6aade04c41ddda19b00c4ebc558930d78eecc64ed",
+                "sha256:c075544284eadc5cddc70f4757331d99dcbc16b2bbd4849d15f8aae4cf36d31c",
+                "sha256:c396e2cc213d12ce017b686e0f53497f94f8ba2b24799c25d913d46c08ec422c",
+                "sha256:cb5aaa8c17760909ec6cb15e744c3ebc2ca8918e727216e79607b7bbce9c8f77",
+                "sha256:cdc793c50be3f01106245a61b739328f7dccc2c648b501e237f0699fe1395b81",
+                "sha256:d25dd59bbbbb996eacf7be6b4ad082ed7eacc4e8f3d2df1ba43822da9bfa122a",
+                "sha256:e42b9594cc3bf4d838d67d6ed62b9e59e201862a25e9a157019e171fbe672dd3",
+                "sha256:e57916ef1bd0fee4f21c4600e9d1da352d8816b52a599c46460e93a6e9f17086",
+                "sha256:ed40e926fa2f297e8a653c954b732f125ef97bdd4c889f243182299de27e2aa9",
+                "sha256:ef8108f8dedf204bb7b42994abf93882da1159728a2d4c5e82012edd92c9da9f",
+                "sha256:f933bbda5a3ee63b8834179096923b094b76f0c7a73c1cfe8f07ad608c58844b",
+                "sha256:fe5c63197c55bce6385d9aee16c4d0641684628f63ace85f73571e65ad1c1e8d"
+            ],
+            "version": "==1.0.5"
+        },
+        "numba": {
+            "hashes": [
+                "sha256:33c0500170d213e66d90558ad6aca57d3e03e97bb11da82e6d87ab793648cb17",
+                "sha256:3cf78d74ad9d289fbc1e5b1c9f2680fca7a788311eb620581893ab347ec37a7e",
+                "sha256:3d6483c27520d16cf5d122868b79cad79e48056ecb721b52d70c126bed65431e",
+                "sha256:4838edef2df5f056cb8974670f3d66562e751040c448eb0b67c7e2fec1726649",
+                "sha256:4c078f84b5529a7fdb8413bb33d5100f11ec7b44aa705857d9eb4e54a54ff505",
+                "sha256:53e9fab973d9e82c9f8449f75994a898daaaf821d84f06fbb0b9de2293dd9306",
+                "sha256:5a82bf37444039c732485c072fda21a361790ed990f88db57fd6941cd5e5d307",
+                "sha256:60ec56386076e9eed106a87c96626d5686fbb16293b9834f0849cf78c9491779",
+                "sha256:643cb09a9ba9e1bd8b060e910aeca455e9442361e80fce97690795ff9840e681",
+                "sha256:6c057ccedca95df23802b6ccad86bb318be624af45b5a38bb8412882be57a681",
+                "sha256:8e00ca63c5d0ad2beeb78d77f087b3a88c45ea9b97e7622ab2ec411a868420ee",
+                "sha256:93df62304ada9b351818ba19b1cfbddaf72cd89348e81474326ca0b23bf0bae1",
+                "sha256:9587ba1bf5f3035575e45562ada17737535c6d612df751e811d702693a72d95e",
+                "sha256:9a1b2b69448e510d672ff9a6b18d2db9355241d93c6a77677baa14bec67dc2a0",
+                "sha256:9b17fbe4a69dcd9a7cd49916b6463cd9a82af5f84911feeb40793b8bce00dfa7",
+                "sha256:9bcc36478773ce838f38afd9a4dfafc328d4ffb1915381353d657da7f6473282",
+                "sha256:a32ee263649aa3c3587b833d6311305379529570e6c20deb0c6f4fb5bc7020db",
+                "sha256:a3eac19529956185677acb7f01864919761bfffbb9ae04bbbe5e84bbc06cfc2b",
+                "sha256:ae50c8c90c2ce8057f9618b589223e13faa8cbc037d8f15b4aad95a2c33a0582",
+                "sha256:c0602e4f896e6a6d844517c3ab434bc978e7698a22a733cc8124465898c28fa8",
+                "sha256:db8268eb5093cae2288942a8cbd69c9352f6fe6e0bfa0a9a27679436f92e4248",
+                "sha256:e447c4634d1cc99ab50d4faa68f680f1d88b06a2a05acf134aa6fcc0342adeca",
+                "sha256:f47dd214adc5dcd040fe9ad2adbd2192133c9075d2189ce1b3d5f9d72863ef05",
+                "sha256:ff66d5b022af6c7d81ddbefa87768e78ed4f834ab2da6ca2fd0d60a9e69b94f5"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.57.1"
+        },
+        "numpy": {
+            "hashes": [
+                "sha256:0ec87a7084caa559c36e0a2309e4ecb1baa03b687201d0a847c8b0ed476a7187",
+                "sha256:1a7d6acc2e7524c9955e5c903160aa4ea083736fde7e91276b0e5d98e6332812",
+                "sha256:202de8f38fc4a45a3eea4b63e2f376e5f2dc64ef0fa692838e31a808520efaf7",
+                "sha256:210461d87fb02a84ef243cac5e814aad2b7f4be953b32cb53327bb49fd77fbb4",
+                "sha256:2d926b52ba1367f9acb76b0df6ed21f0b16a1ad87c6720a1121674e5cf63e2b6",
+                "sha256:352ee00c7f8387b44d19f4cada524586f07379c0d49270f87233983bc5087ca0",
+                "sha256:35400e6a8d102fd07c71ed7dcadd9eb62ee9a6e84ec159bd48c28235bbb0f8e4",
+                "sha256:3c1104d3c036fb81ab923f507536daedc718d0ad5a8707c6061cdfd6d184e570",
+                "sha256:4719d5aefb5189f50887773699eaf94e7d1e02bf36c1a9d353d9f46703758ca4",
+                "sha256:4749e053a29364d3452c034827102ee100986903263e89884922ef01a0a6fd2f",
+                "sha256:5342cf6aad47943286afa6f1609cad9b4266a05e7f2ec408e2cf7aea7ff69d80",
+                "sha256:56e48aec79ae238f6e4395886b5eaed058abb7231fb3361ddd7bfdf4eed54289",
+                "sha256:76e3f4e85fc5d4fd311f6e9b794d0c00e7002ec122be271f2019d63376f1d385",
+                "sha256:7776ea65423ca6a15255ba1872d82d207bd1e09f6d0894ee4a64678dd2204078",
+                "sha256:784c6da1a07818491b0ffd63c6bbe5a33deaa0e25a20e1b3ea20cf0e43f8046c",
+                "sha256:8535303847b89aa6b0f00aa1dc62867b5a32923e4d1681a35b5eef2d9591a463",
+                "sha256:9a7721ec204d3a237225db3e194c25268faf92e19338a35f3a224469cb6039a3",
+                "sha256:a1d3c026f57ceaad42f8231305d4653d5f05dc6332a730ae5c0bea3513de0950",
+                "sha256:ab344f1bf21f140adab8e47fdbc7c35a477dc01408791f8ba00d018dd0bc5155",
+                "sha256:ab5f23af8c16022663a652d3b25dcdc272ac3f83c3af4c02eb8b824e6b3ab9d7",
+                "sha256:ae8d0be48d1b6ed82588934aaaa179875e7dc4f3d84da18d7eae6eb3f06c242c",
+                "sha256:c91c4afd8abc3908e00a44b2672718905b8611503f7ff87390cc0ac3423fb096",
+                "sha256:d5036197ecae68d7f491fcdb4df90082b0d4960ca6599ba2659957aafced7c17",
+                "sha256:d6cc757de514c00b24ae8cf5c876af2a7c3df189028d68c0cb4eaa9cd5afc2bf",
+                "sha256:d933fabd8f6a319e8530d0de4fcc2e6a61917e0b0c271fded460032db42a0fe4",
+                "sha256:ea8282b9bcfe2b5e7d491d0bf7f3e2da29700cec05b49e64d6246923329f2b02",
+                "sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c",
+                "sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"
+            ],
+            "markers": "python_version >= '3.9'",
+            "version": "==1.24.3"
+        },
+        "oauthlib": {
+            "hashes": [
+                "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca",
+                "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.2.2"
+        },
+        "opt-einsum": {
+            "hashes": [
+                "sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147",
+                "sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549"
+            ],
+            "markers": "python_version >= '3.5'",
+            "version": "==3.3.0"
+        },
+        "packaging": {
+            "hashes": [
+                "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61",
+                "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==23.1"
+        },
+        "pandas": {
+            "hashes": [
+                "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682",
+                "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc",
+                "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b",
+                "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089",
+                "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5",
+                "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26",
+                "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210",
+                "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b",
+                "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641",
+                "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd",
+                "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78",
+                "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b",
+                "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e",
+                "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061",
+                "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0",
+                "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e",
+                "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8",
+                "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d",
+                "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0",
+                "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c",
+                "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183",
+                "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df",
+                "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8",
+                "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f",
+                "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.8'",
+            "version": "==2.0.3"
+        },
+        "pillow": {
+            "hashes": [
+                "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5",
+                "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530",
+                "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d",
+                "sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca",
+                "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891",
+                "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992",
+                "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7",
+                "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3",
+                "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba",
+                "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3",
+                "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3",
+                "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f",
+                "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538",
+                "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3",
+                "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d",
+                "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c",
+                "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017",
+                "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3",
+                "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223",
+                "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e",
+                "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3",
+                "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6",
+                "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640",
+                "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334",
+                "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1",
+                "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba",
+                "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa",
+                "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0",
+                "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396",
+                "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d",
+                "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485",
+                "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf",
+                "sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43",
+                "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37",
+                "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2",
+                "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd",
+                "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86",
+                "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967",
+                "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629",
+                "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568",
+                "sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed",
+                "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f",
+                "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551",
+                "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3",
+                "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614",
+                "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff",
+                "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d",
+                "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883",
+                "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684",
+                "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0",
+                "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de",
+                "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b",
+                "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3",
+                "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199",
+                "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51",
+                "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==10.0.0"
+        },
+        "plotly": {
+            "hashes": [
+                "sha256:19cc34f339acd4e624177806c14df22f388f23fb70658b03aad959a0e650a0dc",
+                "sha256:295ac25edeb18c893abb71dcadcea075b78fd6fdf07cee4217a4e1009667925b"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.6'",
+            "version": "==5.16.1"
+        },
+        "portpicker": {
+            "hashes": [
+                "sha256:b2787a41404cf7edbe29b07b9e0ed863b09f2665dcc01c1eb0c2261c1e7d0755",
+                "sha256:bd507fd6f96f65ee02781f2e674e9dc6c99bbfa6e3c39992e3916204c9d431fa"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==1.6.0"
+        },
+        "protobuf": {
+            "hashes": [
+                "sha256:237b9a50bd3b7307d0d834c1b0eb1a6cd47d3f4c2da840802cd03ea288ae8880",
+                "sha256:25ae91d21e3ce8d874211110c2f7edd6384816fb44e06b2867afe35139e1fd1c",
+                "sha256:2b23bd6e06445699b12f525f3e92a916f2dcf45ffba441026357dea7fa46f42b",
+                "sha256:3b7b170d3491ceed33f723bbf2d5a260f8a4e23843799a3906f16ef736ef251e",
+                "sha256:4e69965e7e54de4db989289a9b971a099e626f6167a9351e9d112221fc691bc1",
+                "sha256:58e12d2c1aa428ece2281cef09bbaa6938b083bcda606db3da4e02e991a0d924",
+                "sha256:6bd26c1fa9038b26c5c044ee77e0ecb18463e957fefbaeb81a3feb419313a54e",
+                "sha256:77700b55ba41144fc64828e02afb41901b42497b8217b558e4a001f18a85f2e3",
+                "sha256:7fda70797ddec31ddfa3576cbdcc3ddbb6b3078b737a1a87ab9136af0570cd6e",
+                "sha256:839952e759fc40b5d46be319a265cf94920174d88de31657d5622b5d8d6be5cd",
+                "sha256:bb7aa97c252279da65584af0456f802bd4b2de429eb945bbc9b3d61a42a8cd16",
+                "sha256:c00c3c7eb9ad3833806e21e86dca448f46035242a680f81c3fe068ff65e79c74",
+                "sha256:c5cdd486af081bf752225b26809d2d0a85e575b80a84cde5172a05bbb1990099"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==4.24.2"
+        },
+        "psutil": {
+            "hashes": [
+                "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d",
+                "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217",
+                "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4",
+                "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c",
+                "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f",
+                "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da",
+                "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4",
+                "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42",
+                "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5",
+                "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4",
+                "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9",
+                "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f",
+                "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30",
+                "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==5.9.5"
+        },
+        "pyasn1": {
+            "hashes": [
+                "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57",
+                "sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==0.5.0"
+        },
+        "pyasn1-modules": {
+            "hashes": [
+                "sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c",
+                "sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==0.3.0"
+        },
+        "pygame": {
+            "hashes": [
+                "sha256:009e9886a463f4cb86e5d11024fafb6b9a5f5808d21c4df66938922adc6ee90b",
+                "sha256:06e8e4d04f8d57969689d9316bbffc7e4b9862534541535b6e892e410c4f4248",
+                "sha256:0745fc99d104c71bf90c70c09a541b7922ba47391e16e9629490722fc1f3e46b",
+                "sha256:0b182a6010f16571b1a92e34208ac1e9ad01a725f2fa3c1ad158c4b515ae12de",
+                "sha256:0e8cb75c4cb8ae512e4f3d5410ea64359b049f7825a6e8a9010fdf5c539b8433",
+                "sha256:154a0a0a953006e8ab90353055037b6710e00aa19941e2fbc1b4f0358e1a4882",
+                "sha256:1758bbf986efc9f08344c7eae733f6d04f596745f737bc4c02412b809ac65d58",
+                "sha256:1924826a32cc49c0d6b2e523f05e2ea608e1ff631ba595a910ddf37a8b38ee77",
+                "sha256:1e083351df89cc0f9ce003cff003d420fa0362e9fd00a793c9a68fc8cd0a2e5f",
+                "sha256:217072e80f470847e121b5c5658f89be35e2c1c3ab23d4126fe80fc64fd34d27",
+                "sha256:21a4fbc7462d3b8bd9692a6198fb031a80a85e7ac24cfcee382901b2b57db67c",
+                "sha256:240f920f846150ffb2c217d75cd1005989fbedd007df82949db361934bfa9660",
+                "sha256:261fd434d869e1ae285b3ca90112d62c712fde53679e1bce3dfcee79e93f2b06",
+                "sha256:2f2c3c25a1018495011dd0734df4f23ddf6e88037e884b9ecb03ef33a17c5b4c",
+                "sha256:2fbbd59b95017824a9b7c3695b13f2df27a64c557c42970a6cad2e26dd5f4f31",
+                "sha256:41baee1b5502b7472df0eb59af66ee347ac8ef042b08b553c85546e34daa736a",
+                "sha256:45cfe97fa4de560d866f53afbf3c61136bfb4114eb585ac4b6c82c278baf2c1c",
+                "sha256:4908fc837fd9f68b6cfdfeffb99b342c25cf770d76768498dc4c066c2f8b2776",
+                "sha256:4ef7f1f7d37ffddee63569b5d09f693d0704b04e7c2ff5af90bf61e4cdffe6c7",
+                "sha256:50934f9efc6f51d1c4408b607d34162f74ec0763628c6384e062f6bc3e97d98a",
+                "sha256:55374e1afb72c6dc546fb1d9ac972de2adf642de69ec7003e66023254512f1e5",
+                "sha256:5581841a9baa902d7efc243d3d03ae380f929e90d98ecccbbd49b1fa03e0a6ff",
+                "sha256:5624318d189403dde7c58bdb62c340edd0faad6f18a6cf46a9f6923a66db2ba4",
+                "sha256:565cccf5bf47e2ec577a0c237919aef9da66d075e982d339fae31e37734e02b8",
+                "sha256:595b639fab8a1acafe78050ef71668eae0a22fb5efa022b0cd2bce26a15a371d",
+                "sha256:6403f1705fc3b4fc2a51e06f3a7102cc1ed9884dc9ff5b99a4cac0d65255ad58",
+                "sha256:64d37dc04a14df9519e2e87611cac6a144c6204365e3fdb0bfb86fa459e32f38",
+                "sha256:6d138b15cf378b3755e1e48ea49f0f0406067ada2c176bc6489e70bc836ab72d",
+                "sha256:704cb29a380b8e84d4051300e7eccd918e35cb8c44ae931fc0ec8c942e42c71f",
+                "sha256:76f8207ed3feeda63df711245bec8613809b7aea71db7d0a1515268c5bd6f52d",
+                "sha256:787c1f46905c2f6dd0310144fa7c61cb54d97990c477992601555edf01699f95",
+                "sha256:787fff1984107da0c533d2e87c85b0082788c8b24952adf9f0bc6459be485e7f",
+                "sha256:7b06978de0400276e451cbbbba80cc9566cf5663dbb8518b7ec598078440de06",
+                "sha256:7cd54e859e1b626c332e254128db30e5d6d33544c10154a1ec7a052cac6a50a4",
+                "sha256:867498b0ac20f5c981dcc3ee00262d06b3806b355c632f42c11f5b716ccb2ba1",
+                "sha256:868ec3a2b87fdca43b4f4ef8314d4fb00c4d9ef6e732f5a9b0348ae3d015de3d",
+                "sha256:8fa2701374d3125084b2fd1f9c6d056e7e0fb8ec655e46a5fa1531b7e419fa69",
+                "sha256:97b0ec29c9810ca1125df013656e13a89388e5ca72fdd4d900235bceaed30349",
+                "sha256:986805fbc0827bf8b8dd8c52aa4ce2fbd4475d51fe1df326bf3e788c1b9f59fd",
+                "sha256:a0420eb1015abb3ebba4d29c1239010eac4d82a36b94659847d18af218293aa4",
+                "sha256:a139c4290fa5227ccead2c57f6e41195c22f643e0fb7336c9da0c734c9df3cd4",
+                "sha256:a9f24b4aeba86e882f3640c4251b0325f86556a5f2661bd7c9e3dc1c9fe966c6",
+                "sha256:ab841fe3a4c703cc021d3bb466a9bf5df41edfa3267c44fff5bd6f47944cd4c3",
+                "sha256:ad0835f1406a8589ebe7447801a47ff68c16b753d2b27193947e21c8adbac8c1",
+                "sha256:af925800444941cea5e45eb94954e5335006fb1b5d35d996e22b3f616e9e0e8e",
+                "sha256:b0bfcc7359308748edfe277137efd19f21e5b22373f106848d64dc048db22701",
+                "sha256:b358abc054bf94fede79bb213f7ac9dd8a8737c8ac48a6bb0fa72314c147bd76",
+                "sha256:b75481cc17a22679c69014ef2322d55cfa66be0923abbd9206e01ef10bb5dab6",
+                "sha256:ba578c5cac85358566de3010b3f3393df3b936b310eba6811abbae5241ec19c0",
+                "sha256:bc1bbb23e4f32b361956275bdde5992893981362fd37c0e62586537400bd4a4b",
+                "sha256:bf1024e516fd3a3948ec45f0ad3b63e69f66c342e4678b2e04a383f735272b8f",
+                "sha256:bf5b5fbb526fe76b0eb5a076b94afd29e0a91bc1ba9d6b573fad9722b7e8a5d9",
+                "sha256:bfc8a0d863470ec673ff267caaff59b858e967ef78170dc04fc318a7c5f9dd33",
+                "sha256:c473efee52359806d8c0a876f42ccb7ead088e7ae8f5d31e6e43f94793c943ea",
+                "sha256:c9d827fbd093d5d4ef35bd9ec2994535b5f37af15860975be7e59a2d415b51c7",
+                "sha256:caa4c10f79793b7be9eb1e647f84eee3e9ebb79d143d72eecaaaeb94eb44e1c1",
+                "sha256:cb0493d0f7fa378fccfc65548aec92edeadbde981c964337f11c884432bfaa35",
+                "sha256:cd259998a71a2f7793a4e5d5fc31493f7c7d9ec73e4320895145e7dbc1c8e48d",
+                "sha256:d61bbd7a071d80706fd6337abd96484398dc04b3245e8e5df4e7c99e3676086b",
+                "sha256:d9764bb10f61a1137aa1118b832417c52e53da19971f4fa94605ea5bd4acc92b",
+                "sha256:df29c4369df9231eebffac801fa7af021279d7e9dc4c1cae698cc4077c98d069",
+                "sha256:df450cc4342d5664accfdd895c5ee380710eaf16942722117c01deab3373bb35",
+                "sha256:e18c6b0fa9e39ad3fe68e48ace92285c026ee70f05bf4dfa54a33fa89f7a0474",
+                "sha256:e26c4bb679e7514a7f6c69ca8a68a495013bf46f200476d334459c44a733356f",
+                "sha256:f04d89bdf7951e9fde68c7174b522befd9ae6e5c2a75d195435223df044aaacc",
+                "sha256:f26e9f1385dddffe605d8afbcba1f90f81156deadbc27327dcc844eb71e24ffa",
+                "sha256:f5760ea0f181c8395bc39fdb50000ce2b77d453ba4c0d98e00303a89af6ae5f0",
+                "sha256:f71cfb9f161473511543ba8e713f1b34c6b00c70412ecd2d71111da82fe7f062",
+                "sha256:f9fe7d817ae099f1b1fd0aac7502f7472a3ba18b068efa3dc30b5d293760565a",
+                "sha256:fa8efe34b13a6bfb37e627c2b28e9967b87cd01de92cd000852e08ef8aed1cf0",
+                "sha256:ff16c4cffa9958935d39eed73e5a707fc6e86b85f1ec06baf7172c555801730d"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.1.3"
+        },
+        "pynndescent": {
+            "hashes": [
+                "sha256:5d5dc683c03ef55fe3ddf693859720ca18f85c6e6e5bb0b4f14870278d5288ad"
+            ],
+            "version": "==0.5.10"
+        },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+                "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.8.2"
+        },
+        "pytz": {
+            "hashes": [
+                "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588",
+                "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"
+            ],
+            "version": "==2023.3"
+        },
+        "pyyaml": {
+            "hashes": [
+                "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5",
+                "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc",
+                "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df",
+                "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741",
+                "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206",
+                "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27",
+                "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595",
+                "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62",
+                "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98",
+                "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696",
+                "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290",
+                "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9",
+                "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d",
+                "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6",
+                "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867",
+                "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47",
+                "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486",
+                "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6",
+                "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3",
+                "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007",
+                "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938",
+                "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0",
+                "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c",
+                "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735",
+                "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d",
+                "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28",
+                "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4",
+                "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba",
+                "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8",
+                "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5",
+                "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd",
+                "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3",
+                "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0",
+                "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515",
+                "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c",
+                "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c",
+                "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924",
+                "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34",
+                "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43",
+                "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859",
+                "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673",
+                "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54",
+                "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a",
+                "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b",
+                "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab",
+                "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa",
+                "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c",
+                "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585",
+                "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d",
+                "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==6.0.1"
+        },
+        "ray": {
+            "hashes": [
+                "sha256:015a2aa30aba0719d20cdf8fa32c689b68016678cb20f46bd1df8b227c938b84",
+                "sha256:0a5870f9a16cb94080d770f83326d7e2163d88d75be240273cef4b932a071bb2",
+                "sha256:18d033cc468e5171d9995476c33f99a5b79f091c34265c7e9f3d8b1c9042437e",
+                "sha256:1a8de31a9a4049134cf7e97b725a4078c958a964d091cb3e812e31eddd013bd7",
+                "sha256:31f1dd05130e712b9b64ccad9e6eaa82c715bb25a0a45ffd48ebf4953f6fe347",
+                "sha256:3ccf809e5948333c1c8c81694514b5900259e79cbdc8bddd3680695820cafcf2",
+                "sha256:3e5a4bbc29268a64bd2a8d48ed60f32a5bcce285a2a4f4339174947733449e37",
+                "sha256:467b9aa63f09d20e3985457816d703fe27ea388cdcaa88ff5eff222f8074a05c",
+                "sha256:485e4cd46a569416a14a72c06fe7901b0e3902f3023100b375c477975824e707",
+                "sha256:4b4600c93e2e94b6ca75ef4b4cb92d7f98d4be5484273d6fbac4218fb82cf96f",
+                "sha256:56b920a1814decdd20a754b7c5048770684d6d3d242c83aa99da5d3e8c339f13",
+                "sha256:5923849ec0854ab3e5ca8873d47ed7e11074e1213a3c40f8864c9500de034313",
+                "sha256:787ec7f43f5b3ed85728cf4878bdfed0a334d9108b6af75ef3fe5c8d44a7f74d",
+                "sha256:81e2ee7252e2fbfb05a24124774a8de563daa261200a08d9cbc6b499f7262af1",
+                "sha256:8a3cde58dba07da7a62e1f804b3dae5b29de3be052e02e4559bff7e7cb4d4a3b",
+                "sha256:90b780e131f891185f9de2b9c08d1f2d729e5755c7389a1ddaa6f796fae0d787",
+                "sha256:a182a80aebf863b5d4e875bed0a80e83200e84f4f63c4126cef87cc01e43f067",
+                "sha256:a4ef2f52319286720be7f3bfe6043e9fd0b8cb7826cb2ffc90c23c1c42427464",
+                "sha256:abc6a537454506a5fa87137de058d12aeea38da7077aae6f0ebf6199e5f5b2a1",
+                "sha256:b358fd112876c3a249fd8cffbf20b26622817c78b2ade0a725a7036c693f8d70",
+                "sha256:bca66c8e8163f06dc5443623e7b221660529a39574a589ba9257f2188ea8bf6b",
+                "sha256:bdeacaafcbb97e5f1c3c3349e7fcc0c40f691cea2bf057027c5491ea1ac929b0",
+                "sha256:dff21468d621c8dac95b3df320e6c6121f6618f6827243fd75a057c8815c2498",
+                "sha256:e0f8eaf4c4592335722dad474685c2ffc98207b997e47a24b297a60db389a4cb"
+            ],
+            "index": "pypi",
+            "version": "==2.6.3"
+        },
+        "referencing": {
+            "hashes": [
+                "sha256:449b6669b6121a9e96a7f9e410b245d471e8d48964c67113ce9afe50c8dd7bdf",
+                "sha256:794ad8003c65938edcdbc027f1933215e0d0ccc0291e3ce20a4d87432b59efc0"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.30.2"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f",
+                "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.31.0"
+        },
+        "requests-oauthlib": {
+            "hashes": [
+                "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
+                "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.3.1"
+        },
+        "rpds-py": {
+            "hashes": [
+                "sha256:00215f6a9058fbf84f9d47536902558eb61f180a6b2a0fa35338d06ceb9a2e5a",
+                "sha256:0028eb0967942d0d2891eae700ae1a27b7fd18604cfcb16a1ef486a790fee99e",
+                "sha256:0155c33af0676fc38e1107679be882077680ad1abb6303956b97259c3177e85e",
+                "sha256:063411228b852fb2ed7485cf91f8e7d30893e69b0acb207ec349db04cccc8225",
+                "sha256:0700c2133ba203c4068aaecd6a59bda22e06a5e46255c9da23cbf68c6942215d",
+                "sha256:08e08ccf5b10badb7d0a5c84829b914c6e1e1f3a716fdb2bf294e2bd01562775",
+                "sha256:0d292cabd7c8335bdd3237ded442480a249dbcdb4ddfac5218799364a01a0f5c",
+                "sha256:15932ec5f224b0e35764dc156514533a4fca52dcfda0dfbe462a1a22b37efd59",
+                "sha256:18f87baa20e02e9277ad8960cd89b63c79c05caf106f4c959a9595c43f2a34a5",
+                "sha256:1a6420a36975e0073acaeee44ead260c1f6ea56812cfc6c31ec00c1c48197173",
+                "sha256:1b401e8b9aece651512e62c431181e6e83048a651698a727ea0eb0699e9f9b74",
+                "sha256:1d7b7b71bcb82d8713c7c2e9c5f061415598af5938666beded20d81fa23e7640",
+                "sha256:23750a9b8a329844ba1fe267ca456bb3184984da2880ed17ae641c5af8de3fef",
+                "sha256:23a059143c1393015c68936370cce11690f7294731904bdae47cc3e16d0b2474",
+                "sha256:26d9fd624649a10e4610fab2bc820e215a184d193e47d0be7fe53c1c8f67f370",
+                "sha256:291c9ce3929a75b45ce8ddde2aa7694fc8449f2bc8f5bd93adf021efaae2d10b",
+                "sha256:298e8b5d8087e0330aac211c85428c8761230ef46a1f2c516d6a2f67fb8803c5",
+                "sha256:2c7c4266c1b61eb429e8aeb7d8ed6a3bfe6c890a1788b18dbec090c35c6b93fa",
+                "sha256:2d68a8e8a3a816629283faf82358d8c93fe5bd974dd2704152394a3de4cec22a",
+                "sha256:344b89384c250ba6a4ce1786e04d01500e4dac0f4137ceebcaad12973c0ac0b3",
+                "sha256:3455ecc46ea443b5f7d9c2f946ce4017745e017b0d0f8b99c92564eff97e97f5",
+                "sha256:3d544a614055b131111bed6edfa1cb0fb082a7265761bcb03321f2dd7b5c6c48",
+                "sha256:3e5c26905aa651cc8c0ddc45e0e5dea2a1296f70bdc96af17aee9d0493280a17",
+                "sha256:3f5cc8c7bc99d2bbcd704cef165ca7d155cd6464c86cbda8339026a42d219397",
+                "sha256:4992266817169997854f81df7f6db7bdcda1609972d8ffd6919252f09ec3c0f6",
+                "sha256:4d55528ef13af4b4e074d067977b1f61408602f53ae4537dccf42ba665c2c7bd",
+                "sha256:576da63eae7809f375932bfcbca2cf20620a1915bf2fedce4b9cc8491eceefe3",
+                "sha256:58fc4d66ee349a23dbf08c7e964120dc9027059566e29cf0ce6205d590ed7eca",
+                "sha256:5b9bf77008f2c55dabbd099fd3ac87009471d223a1c7ebea36873d39511b780a",
+                "sha256:5e7996aed3f65667c6dcc8302a69368435a87c2364079a066750a2eac75ea01e",
+                "sha256:5f7487be65b9c2c510819e744e375bd41b929a97e5915c4852a82fbb085df62c",
+                "sha256:6388e4e95a26717b94a05ced084e19da4d92aca883f392dffcf8e48c8e221a24",
+                "sha256:65af12f70355de29e1092f319f85a3467f4005e959ab65129cb697169ce94b86",
+                "sha256:668d2b45d62c68c7a370ac3dce108ffda482b0a0f50abd8b4c604a813a59e08f",
+                "sha256:71333c22f7cf5f0480b59a0aef21f652cf9bbaa9679ad261b405b65a57511d1e",
+                "sha256:7150b83b3e3ddaac81a8bb6a9b5f93117674a0e7a2b5a5b32ab31fdfea6df27f",
+                "sha256:748e472345c3a82cfb462d0dff998a7bf43e621eed73374cb19f307e97e08a83",
+                "sha256:75dbfd41a61bc1fb0536bf7b1abf272dc115c53d4d77db770cd65d46d4520882",
+                "sha256:7618a082c55cf038eede4a918c1001cc8a4411dfe508dc762659bcd48d8f4c6e",
+                "sha256:780fcb855be29153901c67fc9c5633d48aebef21b90aa72812fa181d731c6b00",
+                "sha256:78d10c431073dc6ebceed35ab22948a016cc2b5120963c13a41e38bdde4a7212",
+                "sha256:7a3a3d3e4f1e3cd2a67b93a0b6ed0f2499e33f47cc568e3a0023e405abdc0ff1",
+                "sha256:7b6975d3763d0952c111700c0634968419268e6bbc0b55fe71138987fa66f309",
+                "sha256:80772e3bda6787510d9620bc0c7572be404a922f8ccdfd436bf6c3778119464c",
+                "sha256:80992eb20755701753e30a6952a96aa58f353d12a65ad3c9d48a8da5ec4690cf",
+                "sha256:841128a22e6ac04070a0f84776d07e9c38c4dcce8e28792a95e45fc621605517",
+                "sha256:861d25ae0985a1dd5297fee35f476b60c6029e2e6e19847d5b4d0a43a390b696",
+                "sha256:872f3dcaa8bf2245944861d7311179d2c0c9b2aaa7d3b464d99a7c2e401f01fa",
+                "sha256:87c93b25d538c433fb053da6228c6290117ba53ff6a537c133b0f2087948a582",
+                "sha256:8856aa76839dc234d3469f1e270918ce6bec1d6a601eba928f45d68a15f04fc3",
+                "sha256:885e023e73ce09b11b89ab91fc60f35d80878d2c19d6213a32b42ff36543c291",
+                "sha256:899b5e7e2d5a8bc92aa533c2d4e55e5ebba095c485568a5e4bedbc163421259a",
+                "sha256:8ce8caa29ebbdcde67e5fd652c811d34bc01f249dbc0d61e5cc4db05ae79a83b",
+                "sha256:8e1c68303ccf7fceb50fbab79064a2636119fd9aca121f28453709283dbca727",
+                "sha256:8e7e2b3577e97fa43c2c2b12a16139b2cedbd0770235d5179c0412b4794efd9b",
+                "sha256:92f05fc7d832e970047662b3440b190d24ea04f8d3c760e33e7163b67308c878",
+                "sha256:97f5811df21703446b42303475b8b855ee07d6ab6cdf8565eff115540624f25d",
+                "sha256:9affee8cb1ec453382c27eb9043378ab32f49cd4bc24a24275f5c39bf186c279",
+                "sha256:a2da4a8c6d465fde36cea7d54bf47b5cf089073452f0e47c8632ecb9dec23c07",
+                "sha256:a6903cdca64f1e301af9be424798328c1fe3b4b14aede35f04510989fc72f012",
+                "sha256:a8ab1adf04ae2d6d65835995218fd3f3eb644fe20655ca8ee233e2c7270ff53b",
+                "sha256:a8edd467551c1102dc0f5754ab55cd0703431cd3044edf8c8e7d9208d63fa453",
+                "sha256:ac00c41dd315d147b129976204839ca9de699d83519ff1272afbe4fb9d362d12",
+                "sha256:ad277f74b1c164f7248afa968700e410651eb858d7c160d109fb451dc45a2f09",
+                "sha256:ae46a50d235f1631d9ec4670503f7b30405103034830bc13df29fd947207f795",
+                "sha256:afe6b5a04b2ab1aa89bad32ca47bf71358e7302a06fdfdad857389dca8fb5f04",
+                "sha256:b1cb078f54af0abd835ca76f93a3152565b73be0f056264da45117d0adf5e99c",
+                "sha256:b25136212a3d064a8f0b9ebbb6c57094c5229e0de76d15c79b76feff26aeb7b8",
+                "sha256:b3226b246facae14909b465061ddcfa2dfeadb6a64f407f24300d42d69bcb1a1",
+                "sha256:b98e75b21fc2ba5285aef8efaf34131d16af1c38df36bdca2f50634bea2d3060",
+                "sha256:bbd7b24d108509a1b9b6679fcc1166a7dd031dbef1f3c2c73788f42e3ebb3beb",
+                "sha256:bed57543c99249ab3a4586ddc8786529fbc33309e5e8a1351802a06ca2baf4c2",
+                "sha256:c0583f69522732bdd79dca4cd3873e63a29acf4a299769c7541f2ca1e4dd4bc6",
+                "sha256:c1e0e9916301e3b3d970814b1439ca59487f0616d30f36a44cead66ee1748c31",
+                "sha256:c651847545422c8131660704c58606d841e228ed576c8f1666d98b3d318f89da",
+                "sha256:c7853f27195598e550fe089f78f0732c66ee1d1f0eaae8ad081589a5a2f5d4af",
+                "sha256:cbae50d352e4717ffc22c566afc2d0da744380e87ed44a144508e3fb9114a3f4",
+                "sha256:cdbed8f21204398f47de39b0a9b180d7e571f02dfb18bf5f1b618e238454b685",
+                "sha256:d08395595c42bcd82c3608762ce734504c6d025eef1c06f42326a6023a584186",
+                "sha256:d4639111e73997567343df6551da9dd90d66aece1b9fc26c786d328439488103",
+                "sha256:d63787f289944cc4bde518ad2b5e70a4f0d6e2ce76324635359c74c113fd188f",
+                "sha256:d6d5f061f6a2aa55790b9e64a23dfd87b6664ab56e24cd06c78eb43986cb260b",
+                "sha256:d7865df1fb564092bcf46dac61b5def25342faf6352e4bc0e61a286e3fa26a3d",
+                "sha256:db6585b600b2e76e98131e0ac0e5195759082b51687ad0c94505970c90718f4a",
+                "sha256:e36d7369363d2707d5f68950a64c4e025991eb0177db01ccb6aa6facae48b69f",
+                "sha256:e7947d9a6264c727a556541b1630296bbd5d0a05068d21c38dde8e7a1c703ef0",
+                "sha256:eb2d59bc196e6d3b1827c7db06c1a898bfa0787c0574af398e65ccf2e97c0fbe",
+                "sha256:ee9c2f6ca9774c2c24bbf7b23086264e6b5fa178201450535ec0859739e6f78d",
+                "sha256:f4760e1b02173f4155203054f77a5dc0b4078de7645c922b208d28e7eb99f3e2",
+                "sha256:f70bec8a14a692be6dbe7ce8aab303e88df891cbd4a39af091f90b6702e28055",
+                "sha256:f869e34d2326e417baee430ae998e91412cc8e7fdd83d979277a90a0e79a5b47",
+                "sha256:f8b9a7cd381970e64849070aca7c32d53ab7d96c66db6c2ef7aa23c6e803f514",
+                "sha256:f99d74ddf9d3b6126b509e81865f89bd1283e3fc1b568b68cd7bd9dfa15583d7",
+                "sha256:f9e7e493ded7042712a374471203dd43ae3fff5b81e3de1a0513fa241af9fd41",
+                "sha256:fc72ae476732cdb7b2c1acb5af23b478b8a0d4b6fcf19b90dd150291e0d5b26b",
+                "sha256:fccbf0cd3411719e4c9426755df90bf3449d9fc5a89f077f4a7f1abd4f70c910",
+                "sha256:ffcf18ad3edf1c170e27e88b10282a2c449aa0358659592462448d71b2000cfc"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.10.0"
+        },
+        "rsa": {
+            "hashes": [
+                "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7",
+                "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"
+            ],
+            "markers": "python_version >= '3.6' and python_version < '4'",
+            "version": "==4.9"
+        },
+        "scikit-learn": {
+            "hashes": [
+                "sha256:0e8102d5036e28d08ab47166b48c8d5e5810704daecf3a476a4282d562be9a28",
+                "sha256:151ac2bf65ccf363664a689b8beafc9e6aae36263db114b4ca06fbbbf827444a",
+                "sha256:1d54fb9e6038284548072df22fd34777e434153f7ffac72c8596f2d6987110dd",
+                "sha256:3a11936adbc379a6061ea32fa03338d4ca7248d86dd507c81e13af428a5bc1db",
+                "sha256:436aaaae2c916ad16631142488e4c82f4296af2404f480e031d866863425d2a2",
+                "sha256:552fd1b6ee22900cf1780d7386a554bb96949e9a359999177cf30211e6b20df6",
+                "sha256:6a885a9edc9c0a341cab27ec4f8a6c58b35f3d449c9d2503a6fd23e06bbd4f6a",
+                "sha256:7617164951c422747e7c32be4afa15d75ad8044f42e7d70d3e2e0429a50e6718",
+                "sha256:79970a6d759eb00a62266a31e2637d07d2d28446fca8079cf9afa7c07b0427f8",
+                "sha256:850a00b559e636b23901aabbe79b73dc604b4e4248ba9e2d6e72f95063765603",
+                "sha256:8be549886f5eda46436b6e555b0e4873b4f10aa21c07df45c4bc1735afbccd7a",
+                "sha256:981287869e576d42c682cf7ca96af0c6ac544ed9316328fd0d9292795c742cf5",
+                "sha256:9877af9c6d1b15486e18a94101b742e9d0d2f343d35a634e337411ddb57783f3",
+                "sha256:998d38fcec96584deee1e79cd127469b3ad6fefd1ea6c2dfc54e8db367eb396b",
+                "sha256:9d953531f5d9f00c90c34fa3b7d7cfb43ecff4c605dac9e4255a20b114a27369",
+                "sha256:ae80c08834a473d08a204d966982a62e11c976228d306a2648c575e3ead12111",
+                "sha256:c470f53cea065ff3d588050955c492793bb50c19a92923490d18fcb637f6383a",
+                "sha256:c7e28d8fa47a0b30ae1bd7a079519dd852764e31708a7804da6cb6f8b36e3630",
+                "sha256:ded35e810438a527e17623ac6deae3b360134345b7c598175ab7741720d7ffa7",
+                "sha256:ee04835fb016e8062ee9fe9074aef9b82e430504e420bff51e3e5fffe72750ca",
+                "sha256:fd6e2d7389542eae01077a1ee0318c4fec20c66c957f45c7aac0c6eb0fe3c612"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==1.3.0"
+        },
+        "scipy": {
+            "hashes": [
+                "sha256:0f3261f14b767b316d7137c66cc4f33a80ea05841b9c87ad83a726205b901423",
+                "sha256:10eb6af2f751aa3424762948e5352f707b0dece77288206f227864ddf675aca0",
+                "sha256:1342ca385c673208f32472830c10110a9dcd053cf0c4b7d4cd7026d0335a6c1d",
+                "sha256:214cdf04bbae7a54784f8431f976704ed607c4bc69ba0d5d5d6a9df84374df76",
+                "sha256:2b997a5369e2d30c97995dcb29d638701f8000d04df01b8e947f206e5d0ac788",
+                "sha256:2c91cf049ffb5575917f2a01da1da082fd24ed48120d08a6e7297dfcac771dcd",
+                "sha256:3aeb87661de987f8ec56fa6950863994cd427209158255a389fc5aea51fa7055",
+                "sha256:4447ad057d7597476f9862ecbd9285bbf13ba9d73ce25acfa4e4b11c6801b4c9",
+                "sha256:542a757e2a6ec409e71df3d8fd20127afbbacb1c07990cb23c5870c13953d899",
+                "sha256:8d9886f44ef8c9e776cb7527fb01455bf4f4a46c455c4682edc2c2cc8cd78562",
+                "sha256:90d3b1364e751d8214e325c371f0ee0dd38419268bf4888b2ae1040a6b266b2a",
+                "sha256:95763fbda1206bec41157582bea482f50eb3702c85fffcf6d24394b071c0e87a",
+                "sha256:ac74b1512d38718fb6a491c439aa7b3605b96b1ed3be6599c17d49d6c60fca18",
+                "sha256:afdb0d983f6135d50770dd979df50bf1c7f58b5b33e0eb8cf5c73c70600eae1d",
+                "sha256:b0620240ef445b5ddde52460e6bc3483b7c9c750275369379e5f609a1050911c",
+                "sha256:b133f237bd8ba73bad51bc12eb4f2d84cbec999753bf25ba58235e9fc2096d80",
+                "sha256:b29318a5e39bd200ca4381d80b065cdf3076c7d7281c5e36569e99273867f61d",
+                "sha256:b8425fa963a32936c9773ee3ce44a765d8ff67eed5f4ac81dc1e4a819a238ee9",
+                "sha256:d2b813bfbe8dec6a75164523de650bad41f4405d35b0fa24c2c28ae07fcefb20",
+                "sha256:d690e1ca993c8f7ede6d22e5637541217fc6a4d3f78b3672a6fe454dbb7eb9a7",
+                "sha256:e367904a0fec76433bf3fbf3e85bf60dae8e9e585ffd21898ab1085a29a04d16",
+                "sha256:ea932570b1c2a30edafca922345854ff2cd20d43cd9123b6dacfdecebfc1a80b",
+                "sha256:f28f1f6cfeb48339c192efc6275749b2a25a7e49c4d8369a28b6591da02fbc9a",
+                "sha256:f73102f769ee06041a3aa26b5841359b1a93cc364ce45609657751795e8f4a4a",
+                "sha256:fa4909c6c20c3d91480533cddbc0e7c6d849e7d9ded692918c76ce5964997898"
+            ],
+            "markers": "python_version < '3.13' and python_version >= '3.9'",
+            "version": "==1.11.2"
+        },
+        "setuptools": {
+            "hashes": [
+                "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d",
+                "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==68.1.2"
+        },
+        "six": {
+            "hashes": [
+                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.16.0"
+        },
+        "tenacity": {
+            "hashes": [
+                "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a",
+                "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==8.2.3"
+        },
+        "tensorboard": {
+            "hashes": [
+                "sha256:ab69961ebddbddc83f5fa2ff9233572bdad5b883778c35e4fe94bf1798bd8481"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2.13.0"
+        },
+        "tensorboard-data-server": {
+            "hashes": [
+                "sha256:255c02b7f5b03dd5c0a88c928e563441ff39e1d4b4a234cdbe09f016e53d9594",
+                "sha256:9938bd39f5041797b33921066fba0eab03a0dd10d1887a05e62ae58841ad4c3f",
+                "sha256:be8d016a1aa394e6198280d4a3dc37898f56467310c5f5e617cac10a783e055a"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.7.1"
+        },
+        "tensorflow": {
+            "hashes": [
+                "sha256:00060c5516a61e30c51936084ebc37091d116efe9ae74b2818cbd8b2006218e7",
+                "sha256:06559eeaa69e6561cccbe2d02b015bcec663e875c8bbc4643f55692476e52147",
+                "sha256:076d953a1508dc58bf95f30f58bcc9ee364b1353c61e143cb20c2dada91afb05",
+                "sha256:11ad6a7ff49b4a690fbf37a5adaf28ba2686350a859c5f13c58dc8d2cc670375",
+                "sha256:19ee67901702b26787ad685cca65730c163c101c0c2f238a2584d714e0fa8c25",
+                "sha256:2822ac48c38d69b7fb104e606dacbd763c4bf5d3b20791f25be16a5076d01912",
+                "sha256:5e0fdadec59de3d11c5b5129ddc38e739bde7aa13095b82e19d4380e14d04999",
+                "sha256:6fff426661d286a4c634da44275d2ea2b951f392f3e65c8603681e7cb040586a",
+                "sha256:72d68b8c2f382e2d01b956c8ba516c0a7d5dad98111dd351bf82bfa646aa1c72",
+                "sha256:7a08c0e2938ed5b642a8787678123827477b81d316055d5073fff82fa183eb82",
+                "sha256:89125443e998548059c4e4a129dfab2b1ff7f2fd4c8eaed97842c3cd9b663101",
+                "sha256:948003b5a23b72b3d89746d729e62ec5f01e47460f05521b2211d95069f569ba",
+                "sha256:9c04bc3023b6c4cfb9ee9759c3f03f21993891b4c345df52eb5519204fbf28c0",
+                "sha256:b2978b39e8b3919059b5fd9e28508d50a77965d06ed0b537ed71c97de22dabdf",
+                "sha256:cbb83561bb7d55859eaefc70c674e58713d4e10c10927423ed836a5289bbfa86",
+                "sha256:de77306c0c22c9d8754f54700752ac3a1efee895c5357308e6594436404bfbc0",
+                "sha256:e0cf94d36ceaba8f158c6e15404a81fd5b3aa4cb04147c674cf55bd1aec78154",
+                "sha256:e8f0b69ee2f800399fc6bc7ec55fecfa33662d136e425485959d90638f32a32a",
+                "sha256:fa7abe265cc3ebccc9b405a280bf674824c6d85df5e6ccfa985987b3c9d265b4",
+                "sha256:fb2ff1129c93e853c19897d6a22ed0ec56387f5c6290ec03dec1c6f7b80bc396"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2.13.0"
+        },
+        "tensorflow-estimator": {
+            "hashes": [
+                "sha256:6f868284eaa654ae3aa7cacdbef2175d0909df9fcf11374f5166f8bf475952aa"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.13.0"
+        },
+        "tensorflow-io-gcs-filesystem": {
+            "hashes": [
+                "sha256:2dd49262831ee20f03fd3f5d2c679e7111cd1575e0ad60f60b5632f2da555bfc",
+                "sha256:4657f92dcc2474adc773bf69b836818b416c22cfadaac05b9b64f2a53f3009ee",
+                "sha256:4e1d833f6856aec465652c0d7a75a7c28cf83b132b8351ba0c4df4e05136c403",
+                "sha256:58f953665620725c842de8f4074c14779bf11d9081e4d0d8f2b75145de9ee20a",
+                "sha256:68db367697353184667bbd94faf53a58e7b695acb905f23da1e8ccad8bd6b451",
+                "sha256:8295a65fd4fa731b06b31fab223e3ba11369430537169934a17f7bcc07dfef76",
+                "sha256:8d3ddd86a0f7cf4d35f2401d5b28d574d0f296b4e4349c69c671f7b83fc6ce8f",
+                "sha256:99c063f766fdb431d555f17fa185979195abb0477445f054fe16567bfd340fd7",
+                "sha256:a57e64cd5d22085f9b475df9d12086a894eb8861524970c8839a2ec315841a20",
+                "sha256:ac69d8ba4d27435a5e199248b3a3befc19e65d86a97a52a19ee1f43195f51207",
+                "sha256:b85c793e313e9cfed6caa328ec1a162844006a4bc016ba1d116813d7541938a9",
+                "sha256:c7916ca0accdd259c3fbee1b1f0816d61d6e8a639aa5bc1d4cdfbaf63b344623",
+                "sha256:cfa1df21535f7c945041fda99da2940a56b67d86e20aa2ac8cde3d371bc08659",
+                "sha256:dcf4fc3a44f75b7dccb7b40ca709872bf7f0e812522f82aa7881ecdc0d86af48"
+            ],
+            "markers": "platform_machine != 'arm64' or platform_system != 'Darwin'",
+            "version": "==0.33.0"
+        },
+        "tensorflow-probability": {
+            "hashes": [
+                "sha256:fc10597d2b1a26ecdfae2086307944dd7f1082a0e8a2f615b56c1f0121a7763d"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.20.1"
+        },
+        "termcolor": {
+            "hashes": [
+                "sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475",
+                "sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.3.0"
+        },
+        "tf-agents": {
+            "hashes": [
+                "sha256:6f694ac64c00eda2bd2cb0811f9dea99fb6cd521f8ac65b208a5a26bbda309cc"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==0.17.0"
+        },
+        "threadpoolctl": {
+            "hashes": [
+                "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032",
+                "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.2.0"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "tqdm": {
+            "hashes": [
+                "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386",
+                "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==4.66.1"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
+                "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==4.5.0"
+        },
+        "tzdata": {
+            "hashes": [
+                "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a",
+                "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"
+            ],
+            "markers": "python_version >= '2'",
+            "version": "==2023.3"
+        },
+        "umap-learn": {
+            "hashes": [
+                "sha256:dbd57cb181c2b66d238acb5635697526bf24c798082daed0cf9b87f6a3a6c0c7"
+            ],
+            "index": "pypi",
+            "version": "==0.5.3"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f",
+                "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==1.26.16"
+        },
+        "werkzeug": {
+            "hashes": [
+                "sha256:2b8c0e447b4b9dbcc85dd97b6eeb4dcbaf6c8b6c3be0bd654e25553e0a2157d8",
+                "sha256:effc12dba7f3bd72e605ce49807bbe692bd729c3bb122a3b91747a6ae77df528"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2.3.7"
+        },
+        "wheel": {
+            "hashes": [
+                "sha256:0c5ac5ff2afb79ac23ab82bab027a0be7b5dbcf2e54dc50efe4bf507de1f7985",
+                "sha256:75909db2664838d015e3d9139004ee16711748a52c8f336b52882266540215d8"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.41.2"
+        },
+        "wrapt": {
+            "hashes": [
+                "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0",
+                "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420",
+                "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a",
+                "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c",
+                "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079",
+                "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923",
+                "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f",
+                "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1",
+                "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8",
+                "sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86",
+                "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0",
+                "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364",
+                "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e",
+                "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c",
+                "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e",
+                "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c",
+                "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727",
+                "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff",
+                "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e",
+                "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29",
+                "sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7",
+                "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72",
+                "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475",
+                "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a",
+                "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317",
+                "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2",
+                "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd",
+                "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640",
+                "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98",
+                "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248",
+                "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e",
+                "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d",
+                "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec",
+                "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1",
+                "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e",
+                "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9",
+                "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92",
+                "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb",
+                "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094",
+                "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46",
+                "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29",
+                "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd",
+                "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705",
+                "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8",
+                "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975",
+                "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb",
+                "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e",
+                "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b",
+                "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418",
+                "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019",
+                "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1",
+                "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba",
+                "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6",
+                "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2",
+                "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3",
+                "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7",
+                "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752",
+                "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416",
+                "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f",
+                "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1",
+                "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc",
+                "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145",
+                "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee",
+                "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a",
+                "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7",
+                "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b",
+                "sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653",
+                "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0",
+                "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90",
+                "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29",
+                "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6",
+                "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034",
+                "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09",
+                "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559",
+                "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==1.15.0"
+        }
+    },
+    "develop": {
+        "exceptiongroup": {
+            "hashes": [
+                "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9",
+                "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"
+            ],
+            "markers": "python_version < '3.11'",
+            "version": "==1.1.3"
+        },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb",
+                "sha256:dbace7892d8c0c4ac1ad096662232f831d4e64f4c4545bd53016a3e9d4654743"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==6.8.0"
+        },
+        "iniconfig": {
+            "hashes": [
+                "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3",
+                "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.0.0"
+        },
+        "packaging": {
+            "hashes": [
+                "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61",
+                "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==23.1"
+        },
+        "platformdirs": {
+            "hashes": [
+                "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d",
+                "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==3.10.0"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12",
+                "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==1.3.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32",
+                "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.7'",
+            "version": "==7.4.0"
+        },
+        "tomli": {
+            "hashes": [
+                "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+                "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.0.1"
+        },
+        "yapf": {
+            "hashes": [
+                "sha256:958587eb5c8ec6c860119a9c25d02addf30a44f75aa152a4220d30e56a98037c",
+                "sha256:b8bfc1f280949153e795181768ca14ef43d7312629a06c43e7abd279323af313"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.7'",
+            "version": "==0.40.1"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:679e51dd4403591b2d6838a48de3d283f3d188412a9782faadf845f298736ba0",
+                "sha256:ebc15946aa78bd63458992fc81ec3b6f7b1e92d51c35e6de1c3804e73b799147"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.16.2"
+        }
+    }
+}
diff --git a/llvm-ir-dataset-utils/README.md b/llvm-ir-dataset-utils/README.md
new file mode 100644
index 000000000000000..52cf6cd20b941b3
--- /dev/null
+++ b/llvm-ir-dataset-utils/README.md
@@ -0,0 +1,70 @@
+<h1 align='center'>LLVM-IR Dataset Utilities</h1>
+
+This repository contains utilities to construct large LLVM IR datasets from
+multiple sources.
+
+## Getting Started
+
+To get started with the dataset construction utilities, we suggest using [pipenv](https://pipenv.pypa.io) with the provided `Pipfile` to isolate the Python environment from your system installation and other environments. To install the dependencies, run
+
+```bash
+pipenv install
+```
+
+or, if you want to use the packaged lockfile,
+
+```bash
+pipenv sync
+```
+
+After that, activate the environment and install the dataset construction utilities into it:
+
+```bash
+pipenv shell && pip install .
+```
+
+If you want to develop the package, install it in editable mode instead:
+
+```bash
+pipenv shell && pip install -e .
+```
+
+## Creating First Data
+
+To create your first small batch of IR data, run the following from the root directory of the package:
+
+```bash
+python3 ./llvm_ir_dataset_utils/tools/corpus_from_description.py \
+  --source_dir=/path/to/store/dataset/to/source \
+  --corpus_dir=/path/to/store/dataset/to/corpus \
+  --build_dir=/path/to/store/dataset/to/build \
+  --corpus_description=./corpus_descriptions_test/manual_tree.json
+```
+
+> Beware! You'll need a version of `llvm-objcopy` on your `$PATH`. If you are missing `llvm-objcopy`, an easy way to obtain it is to install an LLVM release from your preferred package channel such as `apt`, `dnf`, or `pacman`, or to build LLVM from [source](https://github.com/llvm/llvm-project), where only the LLVM project itself needs to be enabled during the build, i.e. `-DLLVM_ENABLE_PROJECTS="llvm"`; a minimal build sketch follows.
+
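+As a minimal sketch of such a source build (the generator, build type, and paths here are placeholders, not requirements of this package):
+
+```bash
+cmake -S llvm-project/llvm -B llvm-build -G Ninja \
+  -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="llvm"
+cmake --build llvm-build --target llvm-objcopy
+export PATH="$PWD/llvm-build/bin:$PATH"
+```
+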
+You'll then receive a set of `.bc` files in `/path/to/store/dataset/to/corpus/tree`, which you can convert to textual LLVM IR with `llvm-dis`, e.g. from inside that folder:
+
+```bash
+llvm-dis *.bc
+```
+
+> The final steps for loading the data into a dataloader are still to be described here.
+
+## Corpus Description
+
+> The basics of the corpus description format will be outlined here, so that it is easy to point the package at a new source.
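+
+Until that section is written, the files under `corpus_descriptions_test/` are the best reference. For example, `manual_tree.json` (included in this repository) describes a small manually-built corpus:
+
+```json
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://gitlab.com/OldManProgrammer/unix-tree",
+      "commit_sha": "84fa3ddff51b30835a0f9c4a9e4c9225970f9aff"
+    }
+  ],
+  "folder_name": "tree",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang CFLAGS='-Xclang -fembed-bitcode=all -O3' -j${jobs}"
+  ],
+  "license": "GPL-2.0-or-later"
+}
+```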
+
+## IR Sources
+
+The package contains a number of builders that target LLVM-based languages and extract IR. The current status of the builders for the individual sources is as follows:
+
+- [x] Individual projects
+- [x] Rust crates
+- [x] Spack packages
+- [x] Autoconf
+- [x] CMake
+- [x] Julia packages
+- [x] Swift packages
+- ~~[ ] Haskell packages~~ (deferred to a later version of the package)
+- [ ] Python packages (numba)
diff --git a/llvm-ir-dataset-utils/corpus_descriptions/chromium.json b/llvm-ir-dataset-utils/corpus_descriptions/chromium.json
new file mode 100644
index 000000000000000..7891a5708e9e441
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions/chromium.json
@@ -0,0 +1,25 @@
+{
+  "sources": [
+    {
+      "type": "tar",
+      "archive_url": "https://commondatastorage.googleapis.com/chromium-browser-official/chromium-116.0.5845.140.tar.xz"
+    }
+  ],
+  "folder_name": "chromium",
+  "build_system": "manual",
+  "environment_variables": {
+    "cc": "clang",
+    "cxx": "clang++",
+    "ar": "llvm-ar",
+    "nm": "llvm-nm",
+    "cflags": "-Wno-error=format -Wno-error=shadow",
+    "cxxflags": "-Wno-error=format -Wno-error=shadow"
+  },
+  "commands": [
+    "mkdir -p third_party/node/linux/node-linux-x64/bin",
+    "ln -s /usr/bin/node third_party/node/linux/node-linux-x64/bin/",
+    "gn gen ./out/Release --args='is_official_build=true use_thin_lto=false is_cfi=false use_cfi_icall=false use_cfi_cast=false clang_use_chrome_plugins=false clang_embed_bitcode=true is_debug=false symbol_level=0 enable_rust=false use_sysroot=false use_qt=false clang_base_path=\"/usr\" enable_nacl=false use_vaapi=false custom_toolchain=\"//build/toolchain/linux/unbundle:default\" host_toolchain=\"//build/toolchain/linux/unbundle:default\"'",
+    "ninja -C ./out/Release chrome"
+  ],
+  "license": "BSD-3-Clause"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions/firefox.json b/llvm-ir-dataset-utils/corpus_descriptions/firefox.json
new file mode 100644
index 000000000000000..aeb75e6bd0e0f11
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions/firefox.json
@@ -0,0 +1,22 @@
+{
+  "sources": [
+    {
+      "type": "tar",
+      "archive_url": "https://ftp.mozilla.org/pub/firefox/releases/117.0/source/firefox-117.0.source.tar.xz"
+    }
+  ],
+  "folder_name": "firefox",
+  "build_system": "manual",
+  "environment_variables": {
+    "CFLAGS": "-Xclang -fembed-bitcode=all",
+    "CXXFLAGS": "-Xclang -fembed-bitcode=all",
+    "RUSTFLAGS": "--emit=llvm-bc"
+  },
+  "commands": [
+    "echo 'ac_add_options --without-wasm-sandboxed-libraries' > mozconfig",
+    "./mach build"
+  ],
+  "raw_bc_corpus": "bitcode",
+  "license": "MPL-2.0"
+}
+
diff --git a/llvm-ir-dataset-utils/corpus_descriptions/linux.json b/llvm-ir-dataset-utils/corpus_descriptions/linux.json
new file mode 100644
index 000000000000000..6bb9d62da9d9e84
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions/linux.json
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/torvalds/linux",
+      "commit_sha": "a92b7d26c743b9dc06d520f863d624e94978a1d9"
+    }
+  ],
+  "folder_name": "linux",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang KCFLAGS='-Xclang -fembed-bitcode=all' allyesconfig",
+    "make CC=clang KCFLAGS='-Xclang -fembed-bitcode=all' -j128"
+  ],
+  "license": "GPL-2.0-only"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions/llvm.json b/llvm-ir-dataset-utils/corpus_descriptions/llvm.json
new file mode 100644
index 000000000000000..6e74fb203280e57
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions/llvm.json
@@ -0,0 +1,17 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/llvm/llvm-project",
+      "commit_sha": "3391bdc255f1a75c59d71c7305959e84d8d5f468"
+    }
+  ],
+  "folder_name": "llvm-project",
+  "build_system": "cmake",
+  "cmake_flags": {
+    "CMAKE_BUILD_TYPE": "Release",
+    "LLVM_ENABLE_PROJECTS": "all"
+  },
+  "cmake_root": "./llvm",
+  "license": "Apache-2.0"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_cpython.json b/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_cpython.json
new file mode 100644
index 000000000000000..85a6db3f5c717d8
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_cpython.json
@@ -0,0 +1,14 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/python/cpython",
+      "commit_sha": "d8f87cdf94a6533c5cf2d25e09e6fa3eb06720b9"
+    }
+  ],
+  "repo_name": "cpython",
+  "folder_name": "cpython",
+  "build_system": "autoconf",
+  "autoconf_flags": {},
+  "license": "PSF-2.0"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_ffmpeg.json b/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_ffmpeg.json
new file mode 100644
index 000000000000000..aab74cea09fc830
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/autoconf_ffmpeg.json
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/FFmpeg/FFmpeg",
+      "commit_sha": "98346606015c309305587457f0197013df64872c"
+    }
+  ],
+  "folder_name": "FFmpeg",
+  "build_system": "autoconf",
+  "autoconf_flags": {
+    "cc": "clang",
+    "cxx": "clang++"
+  },
+  "license": "LGLP-2.1-or-later"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_azure_sdk_for_rust.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_azure_sdk_for_rust.json
new file mode 100644
index 000000000000000..5b56149bc4f7e11
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_azure_sdk_for_rust.json
@@ -0,0 +1,12 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/Azure/azure-sdk-for-rust.json",
+      "commit_sha": "7fe4da3b4388f6be5aa3049cb3e0658425ce3295"
+    }
+  ],
+  "folder_name": "azure-sdk-for-rust",
+  "build_system": "cargo",
+  "license": "Apache-2.0"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_fall_back_to_tar.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_fall_back_to_tar.json
new file mode 100644
index 000000000000000..9b02d63a2870d70
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_fall_back_to_tar.json
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/efm32-rs/efm32wg-pacs",
+      "commit_sha": null
+    },
+    {
+      "type": "tar",
+      "archive_url": "https://crates.io/api/v1/crates/efm32wg940-pac/0.1.0/download"
+    }
+  ],
+  "folder_name": "efm32wg-pacs",
+  "build_system": "cargo",
+  "license": "BSD-3-Clause"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_syn.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_syn.json
new file mode 100644
index 000000000000000..67c4f4f868b20b0
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_syn.json
@@ -0,0 +1,12 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/dtolnay/syn",
+      "commit_sha": "782fd24671315587d0d3cda3e8603528ce650299"
+    }
+  ],
+  "folder_name": "syn",
+  "build_system": "cargo",
+  "license": "MIT"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_tar_archive.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_tar_archive.json
new file mode 100644
index 000000000000000..b562832f3e357ce
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_tar_archive.json
@@ -0,0 +1,11 @@
+{
+  "sources": [
+    {
+      "type": "tar",
+      "archive_url": "https://crates.io/api/v1/crates/syn/2.0.23/download"
+    }
+  ],
+  "folder_name": "syn",
+  "build_system": "cargo",
+  "license": "MIT"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_timeout.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_timeout.json
new file mode 100644
index 000000000000000..f682c83dfcb0df8
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cargo_timeout.json
@@ -0,0 +1,12 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/tetcoin/tetcore",
+      "commit_sha": null
+    }
+  ],
+  "folder_name": "tetcore",
+  "build_system": "cargo",
+  "license": "Apache-2.0"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_cpuinfo.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_cpuinfo.json
new file mode 100644
index 000000000000000..98cbd636915152a
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_cpuinfo.json
@@ -0,0 +1,13 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/pytorch/cpuinfo",
+      "commit_sha": "5366f69c0f998e943a338f282b774c6a4386b005"
+    }
+  ],
+  "folder_name": "cpuinfo",
+  "build_system": "cmake",
+  "cmake_flags": {},
+  "cmake_root": "./"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_googletest.json b/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_googletest.json
new file mode 100644
index 000000000000000..cfb1b0c14e975c7
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/cmake_googletest.json
@@ -0,0 +1,12 @@
+{
+	"sources": [
+		{
+			"type": "tar",
+			"archive_url": "https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz"
+		}
+	],
+	"folder_name": "googletest",
+	"build_system": "cmake",
+	"cmake_flags": {},
+	"cmake_root": "./"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/julia_gen.json b/llvm-ir-dataset-utils/corpus_descriptions_test/julia_gen.json
new file mode 100644
index 000000000000000..3a528e3ef6a1ff4
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/julia_gen.json
@@ -0,0 +1,7 @@
+{
+  "sources": [],
+  "folder_name": "Gen",
+  "build_system": "julia",
+  "package_name": "Gen",
+  "license": "Apache-2.0"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/julia_zomato.json b/llvm-ir-dataset-utils/corpus_descriptions_test/julia_zomato.json
new file mode 100644
index 000000000000000..27773986958070d
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/julia_zomato.json
@@ -0,0 +1,13 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/rahulkp220/Zomato.jl",
+      "commit_sha": "9ba4dfbf1f22b38c164f2e0087558b150350ccaf"
+    }
+  ],
+  "folder_name": "Zomato",
+  "build_system": "julia",
+  "package_name": "Zomtao",
+  "license": "MIT"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/manual_bc_corpus.json b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_bc_corpus.json
new file mode 100644
index 000000000000000..2c5027b60949aac
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_bc_corpus.json
@@ -0,0 +1,17 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/glample/fastBPE.git",
+      "commit_sha": "036711f8fdc3265d64e8e123a0761be12c5a8e74"
+    }
+  ],
+  "folder_name": "fastbpe",
+  "build_system": "manual",
+  "commands": [
+    "clang++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -Xclang -fembed-bitcode=all -c -o fast.o",
+    "clang++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -emit-llvm -c -o fast.bc"
+  ],
+  "raw_bc_corpus": "bitcode",
+  "license": "MIT"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/manual_no_license.json b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_no_license.json
new file mode 100644
index 000000000000000..fd56e4bf829b902
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_no_license.json
@@ -0,0 +1,14 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://gitlab.com/OldManProgrammer/unix-tree",
+      "commit_sha": "84fa3ddff51b30835a0f9c4a9e4c9225970f9aff"
+    }
+  ],
+  "folder_name": "tree-no-license",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang CFLAGS='-Xclang -fembed-bitcode=all -O3' -j${jobs}"
+  ]
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/manual_tree.json b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_tree.json
new file mode 100644
index 000000000000000..d9f678d8652bc7d
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/manual_tree.json
@@ -0,0 +1,15 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://gitlab.com/OldManProgrammer/unix-tree",
+      "commit_sha": "84fa3ddff51b30835a0f9c4a9e4c9225970f9aff"
+    }
+  ],
+  "folder_name": "tree",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang CFLAGS='-Xclang -fembed-bitcode=all -O3' -j${jobs}"
+  ],
+  "license": "GPL-2.0-or-later"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/sources_fallback.json b/llvm-ir-dataset-utils/corpus_descriptions_test/sources_fallback.json
new file mode 100644
index 000000000000000..075daed947e2351
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/sources_fallback.json
@@ -0,0 +1,19 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/thisisafakerepo",
+      "commit_sha": "84fa3ddff51b30835a0f9c4a9e4c9225970f9aff"
+    },
+    {
+      "type": "tar",
+      "archive_url": "https://github.com/Old-Man-Programmer/tree/archive/refs/tags/2.1.1.tar.gz"
+    }
+  ],
+  "folder_name": "tree",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang CFLAGS='-Xclang -fembed-bitcode=all' -j${jobs}"
+  ],
+  "license": "GPL-2.0-or-later"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/sources_tar_archive.json b/llvm-ir-dataset-utils/corpus_descriptions_test/sources_tar_archive.json
new file mode 100644
index 000000000000000..33a21c0073d8135
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/sources_tar_archive.json
@@ -0,0 +1,14 @@
+{
+  "sources": [
+    {
+      "type": "tar",
+      "archive_url": "https://github.com/Old-Man-Programmer/tree/archive/refs/tags/2.1.1.tar.gz"
+    }
+  ],
+  "folder_name": "tree",
+  "build_system": "manual",
+  "commands": [
+    "make CC=clang CFLAGS='-Xclang -fembed-bitcode=all' -j${jobs}"
+  ],
+  "license": "GPL-2.0-or-later"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/spack_gmake.json b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_gmake.json
new file mode 100644
index 000000000000000..d53a39a4e640431
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_gmake.json
@@ -0,0 +1,9 @@
+{
+  "sources": [],
+  "folder_name": "gmake",
+  "build_system": "spack",
+  "package_name": "gmake",
+  "package_spec": "gmake@=4.4.1%clang@=18.0.0 cflags=\"-Xclang -fembed-bitcode=all\" cxxflags=\"-Xclang -fembed-bitcode=all\" ~guile build_system=generic arch=linux-ubuntu22.04-zen2 license=\"GPL-3.0-only\"",
+  "package_hash": "urhhmbrocyrrhqn5btbiopsrrtc5ng7d",
+  "license": "GPL-3.0-only"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib.json b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib.json
new file mode 100644
index 000000000000000..88b66626217c101
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib.json
@@ -0,0 +1,9 @@
+{
+  "sources": [],
+  "folder_name": "zlib",
+  "build_system": "spack",
+  "package_name": "zlib",
+  "package_spec": "zlib at 1.2.13%clang at 18.0.0 cflags=\"-Xclang -fembed-bitcode=all\" cxxflags=\"-Xclang -fembed-bitcode=all\" +optimize+pic+shared build_system=makefile arch=linux-ubuntu22.04-x86_64_v3",
+  "package_hash": "urhhmbrocyrrhqn5btbiopsrrtc5ng7d",
+  "license": "Zlib"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib_cray.json b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib_cray.json
new file mode 100644
index 000000000000000..4f11453f029376d
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/spack_zlib_cray.json
@@ -0,0 +1,9 @@
+{
+  "sources": [],
+  "folder_name": "zlib",
+  "build_system": "spack",
+  "package_name": "zlib",
+  "package_spec": "zlib at 1.2.13%clang at 16.0.6 cflags=\"-Xclang -fembed-bitcode=all\" cxxflags=\"-Xclang -fembed-bitcode=all\" +optimize+pic+shared build_system=makefile arch=cray-ubuntu22.04-x86_64",
+  "package_hash": "ctg354lial5zjl552xriw2cpnhcwhk3i",
+  "license": "Zlib"
+}
diff --git a/llvm-ir-dataset-utils/corpus_descriptions_test/swift_swift_blocks.json b/llvm-ir-dataset-utils/corpus_descriptions_test/swift_swift_blocks.json
new file mode 100644
index 000000000000000..2008db42351bd49
--- /dev/null
+++ b/llvm-ir-dataset-utils/corpus_descriptions_test/swift_swift_blocks.json
@@ -0,0 +1,13 @@
+{
+  "sources": [
+    {
+      "type": "git",
+      "repo_url": "https://github.com/dirtyhenry/swift-blocks",
+      "commit_sha": null
+    }
+  ],
+  "folder_name": "swift-blocks",
+  "build_system": "swift",
+  "package_name": "swift-blocks",
+  "license": "MIT"
+}
diff --git a/llvm-ir-dataset-utils/docs/building-corpora.md b/llvm-ir-dataset-utils/docs/building-corpora.md
new file mode 100644
index 000000000000000..bfefc87d2a5666e
--- /dev/null
+++ b/llvm-ir-dataset-utils/docs/building-corpora.md
@@ -0,0 +1,18 @@
+# Building Corpora
+
+### Building a corpus from an individual description
+
+To build a corpus from an individual description, run the following command from
+the root directory of this repository:
+
+```bash
+PYTHONPATH="./" python3 ./llvm_ir_dataset_utils/tools/corpus_from_description.py \
+  --base_dir=<path to build> \
+  --corpus_dir=<path to corpus> \
+  --corpus_description=<path to corpus description json>
+```
+
+The script will take the application description, clone the source, build the
+application with the appropriate flags, and then extract unoptimized IR from
+the build, placing it in a subdirectory of the directory passed to
+`--corpus_dir` in the ml-compiler-opt corpus format.
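+
+As a rough sketch of the result (the exact manifest fields come from the
+ml-compiler-opt tooling and may vary between versions), the per-package corpus
+directory will contain something like:
+
+```
+<corpus_dir>/<folder_name>/
+  build_manifest.json       # build, source, and license log written by the builder
+  corpus_description.json   # ml-compiler-opt corpus manifest listing the modules
+  <module path>.bc          # embedded bitcode extracted from each built object
+  <module path>.cmd         # compile command recorded for that module
+```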
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/__init__.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/__init__.py
new file mode 100644
index 000000000000000..e69de29bb2d1d64
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/__init__.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/__init__.py
new file mode 100644
index 000000000000000..e69de29bb2d1d64
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/autoconf_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/autoconf_builder.py
new file mode 100644
index 000000000000000..e95138043d88d7a
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/autoconf_builder.py
@@ -0,0 +1,68 @@
+"""Module for building and extracting bitcode from applications using autoconf"""
+
+import os
+import subprocess
+
+from compiler_opt.tools import extract_ir_lib
+
+CONFIGURE_LOG_NAME = './configure.log'
+BUILD_LOG_NAME = './build.log'
+
+
+def generate_configure_command(root_path, options_dict):
+  command_vector = [os.path.join(root_path, "configure")]
+  for option in options_dict:
+    command_vector.append(f"--{option}=\"{options_dict[option]}\"")
+  return command_vector
+
+
+def generate_build_command(threads):
+  command_vector = ["make", f"-j{threads}"]
+  return command_vector
+
+
+def perform_build(configure_command_vector, build_command_vector, build_dir,
+                  corpus_dir):
+  configure_env = os.environ.copy()
+  configure_env["CC"] = "clang"
+  configure_env["CXX"] = "clang++"
+  configure_env["CFLAGS"] = "-Xclang -fembed-bitcode=all"
+  configure_env["CXXFLAGS"] = "-Xclang -fembed-bitcode=all"
+  configure_command = " ".join(configure_command_vector)
+  configure_log_path = os.path.join(corpus_dir, CONFIGURE_LOG_NAME)
+  with open(configure_log_path, 'w') as configure_log_file:
+    configure_process = subprocess.run(
+        configure_command,
+        cwd=build_dir,
+        env=configure_env,
+        shell=True,
+        stdout=configure_log_file,
+        stderr=configure_log_file)
+    configure_success = configure_process.returncode == 0
+  build_log_path = os.path.join(corpus_dir, BUILD_LOG_NAME)
+  with open(build_log_path, 'w') as build_log_file:
+    build_process = subprocess.run(
+        build_command_vector,
+        cwd=build_dir,
+        stdout=build_log_file,
+        stderr=build_log_file)
+    build_success = build_process.returncode == 0
+  return {
+      'targets': [{
+          'success': build_success and configure_success,
+          'build_log': BUILD_LOG_NAME,
+          'configure_log': CONFIGURE_LOG_NAME,
+          'name': os.path.basename(corpus_dir),
+          'build_success': build_success,
+          'configure_success': configure_success
+      }]
+  }
+
+
+def extract_ir(build_dir, corpus_dir, threads):
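+  # Extract the .llvmbc and .llvmcmd sections that -fembed-bitcode=all placed
+  # in the built objects, using llvm-objcopy via compiler_opt's extract_ir_lib,
+  # and write the resulting corpus manifest into corpus_dir.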
+  objects = extract_ir_lib.load_from_directory(build_dir, corpus_dir)
+  relative_output_paths = extract_ir_lib.run_extraction(objects, threads,
+                                                        "llvm-objcopy", None,
+                                                        None, ".llvmcmd",
+                                                        ".llvmbc")
+  extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/builder.py
new file mode 100644
index 000000000000000..fcc55d3edf568c5
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/builder.py
@@ -0,0 +1,231 @@
+"""Module that parses application description, downloads source code, and invokes the correct builder"""
+
+import os
+import json
+import pathlib
+import multiprocessing
+import shutil
+import glob
+import logging
+import hashlib
+
+import ray
+
+from llvm_ir_dataset_utils.builders import cmake_builder
+from llvm_ir_dataset_utils.builders import manual_builder
+from llvm_ir_dataset_utils.builders import autoconf_builder
+from llvm_ir_dataset_utils.builders import cargo_builder
+from llvm_ir_dataset_utils.builders import spack_builder
+from llvm_ir_dataset_utils.builders import julia_builder
+from llvm_ir_dataset_utils.builders import swift_builder
+
+from llvm_ir_dataset_utils.sources import source
+
+from llvm_ir_dataset_utils.util import file
+from llvm_ir_dataset_utils.util import licenses
+
+
+def get_corpus_size(corpus_dir):
+  total_size = 0
+  for bitcode_file in glob.glob(
+      os.path.join(corpus_dir, '**/*.bc'), recursive=True):
+    total_size += os.path.getsize(bitcode_file)
+  return total_size
+
+
+def get_build_future(corpus_description,
+                     source_base_dir,
+                     build_base_dir,
+                     corpus_dir,
+                     threads,
+                     extra_env_variables,
+                     extra_builder_arguments={},
+                     cleanup=False,
+                     archive_corpus=False):
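+  # Schedule parse_and_build_from_description as a Ray task limited to
+  # `threads` CPUs, so several package builds can be scheduled concurrently.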
+  return parse_and_build_from_description.options(num_cpus=threads).remote(
+      corpus_description,
+      source_base_dir,
+      build_base_dir,
+      corpus_dir,
+      threads,
+      extra_env_variables,
+      extra_builder_arguments=extra_builder_arguments,
+      cleanup=cleanup,
+      archive_corpus=archive_corpus)
+
+
+def get_license_information(source_dir, corpus_dir):
+  license_files = licenses.get_all_license_files(source_dir)
+  license_file_list = []
+  for license_description in license_files:
+    # copy each license over
+    license_file = license_description['file']
+    with open(os.path.join(source_dir, license_file),
+              'rb') as license_file_handle:
+      license_data = license_file_handle.read()
+      license_hash = hashlib.sha256(license_data).hexdigest()
+      new_license_path = f'./license-{license_hash}.txt'
+      new_license_dict = license_description
+      new_license_dict['file'] = new_license_path
+      license_file_list.append(new_license_dict)
+    with open(os.path.join(corpus_dir, new_license_path),
+              'wb') as new_license_file_handle:
+      new_license_file_handle.write(license_data)
+  return license_file_list
+
+
+@ray.remote(num_cpus=multiprocessing.cpu_count())
+def parse_and_build_from_description(corpus_description,
+                                     source_base_dir,
+                                     build_base_dir,
+                                     corpus_base_dir,
+                                     threads,
+                                     extra_env_variables,
+                                     extra_builder_arguments={},
+                                     cleanup=False,
+                                     archive_corpus=False):
+  # Construct relevant paths for the build
+  corpus_dir = os.path.join(corpus_base_dir, corpus_description["folder_name"])
+  if corpus_description["build_system"] == "manual":
+    build_dir = os.path.join(build_base_dir, corpus_description["folder_name"])
+  else:
+    build_dir = os.path.join(build_base_dir,
+                             corpus_description["folder_name"] + "-build")
+  source_dir = os.path.join(source_base_dir, corpus_description["folder_name"])
+
+  # Handle the case where we are archiving corpora and we already have some
+  # packages that have finished building.
+  if archive_corpus and os.path.exists(f'{corpus_dir}.tar'):
+    # We already have an archived corpus for this package, so we can exit early
+    # without doing the build.
+    logging.warning(
+        f'Found already built version of package at {corpus_dir}, skipping')
+    return {}
+  else:
+    if os.path.exists(corpus_dir):
+      shutil.rmtree(corpus_dir, ignore_errors=True)
+    if os.path.exists(build_dir):
+      shutil.rmtree(build_dir)
+    if os.path.exists(source_dir):
+      shutil.rmtree(source_dir)
+
+  pathlib.Path(corpus_dir).mkdir(exist_ok=True, parents=True)
+  pathlib.Path(source_base_dir).mkdir(exist_ok=True)
+  pathlib.Path(build_base_dir).mkdir(exist_ok=True)
+  to_download_dir = build_base_dir if corpus_description[
+      "build_system"] == "manual" else source_base_dir
+  source_logs = source.download_source(corpus_description['sources'],
+                                       to_download_dir, corpus_dir,
+                                       corpus_description['folder_name'])
+
+  if not os.path.exists(build_dir):
+    os.makedirs(build_dir)
+  build_log = {}
+  if corpus_description["build_system"] == "cmake":
+    configure_command_vector = cmake_builder.generate_configure_command(
+        os.path.join(source_dir, corpus_description["cmake_root"]),
+        corpus_description["cmake_flags"])
+    build_command_vector = cmake_builder.generate_build_command([], threads)
+    build_log = cmake_builder.perform_build(configure_command_vector,
+                                            build_command_vector, build_dir,
+                                            corpus_dir)
+    cmake_builder.extract_ir(build_dir, corpus_dir, threads)
+  elif corpus_description["build_system"] == "manual":
+    if 'environment_variables' in corpus_description:
+      environment_variables = corpus_description['environment_variables']
+    else:
+      environment_variables = {}
+    build_log = manual_builder.perform_build(corpus_description["commands"],
+                                             build_dir, threads, corpus_dir,
+                                             environment_variables)
+    manual_builder.extract_ir(build_dir, corpus_dir, threads)
+    if 'raw_bc_corpus' in corpus_description:
+      bc_corpus_dir = f'{corpus_dir}-{corpus_description["raw_bc_corpus"]}'
+      os.makedirs(bc_corpus_dir)
+      manual_builder.extract_raw_ir(build_dir, bc_corpus_dir, threads)
+  elif corpus_description["build_system"] == "autoconf":
+    configure_command_vector = autoconf_builder.generate_configure_command(
+        source_dir, corpus_description["autoconf_flags"])
+    build_command_vector = autoconf_builder.generate_build_command(threads)
+    build_log = autoconf_builder.perform_build(configure_command_vector,
+                                               build_command_vector, build_dir,
+                                               corpus_dir)
+    autoconf_builder.extract_ir(build_dir, corpus_dir, threads)
+  elif corpus_description["build_system"] == "cargo":
+    build_log = cargo_builder.build_all_targets(source_dir, build_dir,
+                                                corpus_dir, threads,
+                                                extra_env_variables, cleanup)
+    if len(build_log['targets']) == 0 and source_logs[-1]['type'] == 'git':
+      logging.warning('Cargo builder detected no targets from git repository, '
+                      'retrying with tar archive.')
+      shutil.rmtree(source_dir)
+      # The git repository is always guaranteed to be the first source as long
+      # as the description was generated by parse_crates_database.py.
+      corpus_description['sources'].pop(0)
+      build_future = get_build_future(corpus_description, source_base_dir,
+                                      build_base_dir, corpus_base_dir, threads,
+                                      extra_env_variables, cleanup=cleanup)
+      ray.get(build_future)
+      return {}
+  elif corpus_description["build_system"] == "spack":
+    if 'dependency_futures' in extra_builder_arguments:
+      dependency_futures = extra_builder_arguments['dependency_futures']
+    else:
+      dependency_futures = []
+    build_log = spack_builder.build_package(
+        dependency_futures, corpus_description['package_name'],
+        corpus_description['package_spec'], corpus_description['package_hash'],
+        corpus_dir, threads, extra_builder_arguments['buildcache_dir'],
+        build_dir, cleanup)
+  elif corpus_description["build_system"] == "julia":
+    build_log = julia_builder.perform_build(corpus_description['package_name'],
+                                            build_dir, corpus_dir, threads)
+  elif corpus_description["build_system"] == "swift":
+    build_log = swift_builder.perform_build(source_dir, build_dir, corpus_dir,
+                                            threads,
+                                            corpus_description['package_name'])
+  else:
+    raise ValueError(
+        f"Build system {corpus_description['build_system']} is not supported")
+
+  # Collect license files from the build
+  source_license_dir = source_dir
+  if corpus_description['build_system'] == 'spack':
+    # Spack doesn't use the source directory, so we should instead pull
+    # information from the build directory.
+    spack_stage_dir = spack_builder.get_spack_stage_directory(
+        corpus_description['package_hash'], build_dir)
+    if spack_stage_dir is None:
+      source_license_dir = None
+    else:
+      source_license_dir = os.path.join(spack_stage_dir, 'spack-src')
+  elif corpus_description['build_system'] == 'manual':
+    # The manual builder clones everything into the build directory, so
+    # just use that.
+    source_license_dir = build_dir
+  if source_license_dir is not None:
+    build_log['license_files'] = get_license_information(
+        source_license_dir, corpus_dir)
+  else:
+    build_log['license_files'] = []
+
+  if cleanup:
+    file.delete_directory(build_dir, corpus_dir)
+    file.delete_directory(source_dir, corpus_dir)
+  build_log['sources'] = source_logs
+  build_log['size'] = get_corpus_size(corpus_dir)
+
+  if 'license' in corpus_description:
+    build_log['license'] = corpus_description['license']
+  else:
+    build_log['license'] = None
+
+  with open(os.path.join(corpus_dir, 'build_manifest.json'),
+            'w') as build_manifest:
+    json.dump(build_log, build_manifest, indent=2)
+  if archive_corpus:
+    # Use corpus_dir for the file path as make_archive automatically adds the
+    # .tar extension to the path
+    shutil.make_archive(corpus_dir, 'tar', corpus_dir)
+    shutil.rmtree(corpus_dir, ignore_errors=True)
+  return build_log
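+
+
+# Illustrative only: a minimal corpus description of the shape this dispatcher
+# expects for a cargo build. The field names are taken from the code above;
+# the repository URL and license value are hypothetical.
+#   {
+#     "sources": [{"type": "git",
+#                  "repo_url": "https://github.com/example/crate",
+#                  "commit_sha": ""}],
+#     "folder_name": "example-crate",
+#     "build_system": "cargo",
+#     "license": "MIT"
+#   }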
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cargo_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cargo_builder.py
new file mode 100644
index 000000000000000..2712583bf84a0d2
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cargo_builder.py
@@ -0,0 +1,177 @@
+"""Module for building and extracting bitcode from applications using cargo"""
+
+import subprocess
+import os
+import json
+import multiprocessing
+import shutil
+import pathlib
+import logging
+
+import ray
+
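+# Suppress TensorFlow's C++ INFO/WARNING output; the compiler_opt modules
+# imported below pull in TensorFlow.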
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+
+from compiler_opt.tools import make_corpus_lib
+from compiler_opt.tools import combine_training_corpus_lib
+
+BUILD_TIMEOUT = 900
+
+
+def get_spec_from_id(id):
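+  # Converts a cargo metadata package id of the (older) form
+  # 'name version (path+file:///path/to/crate)' into a spec usable with
+  # `cargo rustc -p`; e.g. 'syn 2.0.0 (path+file:///src/syn)' becomes
+  # 'file:///src/syn#syn@2.0.0' (the crate path here is illustrative).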
+  sections = id.split('(')
+  file_path = sections[1][5:-1]
+  name_version = sections[0].split(' ')
+  name = name_version[0]
+  version = name_version[1]
+  return f'{file_path}#{name}@{version}'
+
+
+def get_packages_from_manifest(source_dir):
+  command_vector = ["cargo", "metadata", "--no-deps"]
+  if not os.path.exists(source_dir):
+    return []
+  try:
+    # TODO(boomanaiden154): Dump the stderr of the metadata command to a log
+    # somewhere
+    out = subprocess.check_output(
+        command_vector, cwd=source_dir, stderr=subprocess.PIPE)
+    manifest = json.loads(out.decode("utf-8"))
+    packages = {}
+    for package in manifest["packages"]:
+      targets = []
+      for target in package["targets"]:
+        targets.append({
+            "name": target["name"],
+            "kind": target["kind"][0],
+            "spec": get_spec_from_id(package['id']),
+            "package": package['name']
+        })
+      packages[package["name"]] = targets
+    return packages
+  except subprocess.SubprocessError:
+    return []
+
+
+def get_build_log_name(target):
+  return './' + target['name'] + '.' + target['kind'] + '.build.log'
+
+
+def build_all_targets(source_dir, build_dir, corpus_dir, threads,
+                      extra_env_variables, cleanup):
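+  # Each package found in the cargo workspace is built as its own Ray task with
+  # a dedicated target directory ('<build_dir>-<package>') so packages can be
+  # built in parallel; the per-package logs are merged into a single build log.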
+  package_list = get_packages_from_manifest(source_dir)
+  build_log = {'targets': []}
+  package_futures = []
+  for package in package_list:
+    package_build_dir = build_dir + '-' + package
+    package_futures.append(
+        build_package_future(source_dir, package_build_dir, corpus_dir,
+                             package_list[package], threads,
+                             extra_env_variables, cleanup))
+  package_build_logs = ray.get(package_futures)
+  for package_build_log in package_build_logs:
+    build_log['targets'].extend(package_build_log)
+  combine_training_corpus_lib.combine_corpus(corpus_dir)
+  return build_log
+
+
+def build_package_future(source_dir, build_dir, corpus_dir, targets, threads,
+                         extra_env_variables, cleanup):
+  return build_package.options(num_cpus=threads).remote(source_dir, build_dir,
+                                                        corpus_dir, targets,
+                                                        threads,
+                                                        extra_env_variables,
+                                                        cleanup)
+
+
+@ray.remote(num_cpus=multiprocessing.cpu_count())
+def build_package(source_dir, build_dir, corpus_dir, targets, threads,
+                  extra_env_variables, cleanup):
+  build_log = []
+  for target in targets:
+    build_log.append(
+        perform_build(source_dir, build_dir, corpus_dir, target, threads,
+                      extra_env_variables))
+  package_corpus_dir = os.path.join(corpus_dir, targets[0]["package"])
+  # We should never need to create the parent directories (builder.py provides
+  # them), and the folder itself should never exist before we create it.
+  pathlib.Path(package_corpus_dir).mkdir(exist_ok=False, parents=False)
+  extract_ir(build_dir, package_corpus_dir)
+  if cleanup:
+    if os.path.exists(build_dir):
+      try:
+        shutil.rmtree(build_dir)
+      except OSError:
+        logging.warning(
+            f'Failed to delete directory {build_dir}, probably deleted by another process.'
+        )
+  return build_log
+
+
+def perform_build(source_dir, build_dir, corpus_dir, target, threads,
+                  extra_env_variables):
+  logging.info(
+      f"Building target {target['name']} of type {target['kind']} from package {target['package']}"
+  )
+  build_env = os.environ.copy()
+  build_env["CARGO_TARGET_DIR"] = build_dir
+  build_env.update(extra_env_variables)
+  build_command_vector = [
+      "cargo", "rustc", "-p", f"{target['spec']}", "-j",
+      str(threads)
+  ]
+  if target['kind'] == "lib":
+    build_command_vector.append("--lib")
+  elif target['kind'] == "test":
+    build_command_vector.extend(["--test", target['name']])
+  elif target['kind'] == "bench":
+    build_command_vector.extend(["--bench", target['name']])
+  elif target['kind'] == "bin":
+    build_command_vector.extend(["--bin", target['name']])
+  elif target['kind'] == "example":
+    build_command_vector.extend(["--example", target['name']])
+  else:
+    logging.warning(
+        f'{target["name"]} has unrecognized target type {target["kind"]} in package {target["package"]}'
+    )
+    return {
+        'success': False,
+        'build_log': None,
+        'name': target['name'] + '.' + target['kind']
+    }
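+  # Everything after '--' is passed to rustc: emit LLVM bitcode
+  # (--emit=llvm-bc) and run codegen without the default optimization pass
+  # pipeline (-C no-prepopulate-passes) so the bitcode is essentially
+  # unoptimized.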
+  build_command_vector.extend(
+      ["--", '--emit=llvm-bc', '-C', 'no-prepopulate-passes'])
+  try:
+    build_log_path = os.path.join(corpus_dir, get_build_log_name(target))
+    with open(build_log_path, 'w') as build_log_file:
+      subprocess.run(
+          build_command_vector,
+          cwd=source_dir,
+          env=build_env,
+          check=True,
+          stdout=build_log_file,
+          stderr=build_log_file,
+          timeout=BUILD_TIMEOUT)
+  except subprocess.SubprocessError:
+    logging.warning(
+        f"Failed to build target {target['name']} of type {target['kind']} from package {target['package']}"
+    )
+    build_success = False
+  else:
+    logging.info(
+        f"Finished building target {target['name']} of type {target['kind']} from package {target['package']}"
+    )
+    build_success = True
+  return {
+      'success': build_success,
+      'build_log': get_build_log_name(target),
+      'name': target['name'] + '.' + target['kind']
+  }
+
+
+def extract_ir(build_dir, corpus_dir):
+  # TODO(boomanaiden154): Look into getting a build manifest from cargo.
+  relative_paths = make_corpus_lib.load_bitcode_from_directory(build_dir)
+  make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+  make_corpus_lib.write_corpus_manifest(relative_paths, corpus_dir, '')
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cmake_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cmake_builder.py
new file mode 100644
index 000000000000000..06c94c9d58f1b35
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/cmake_builder.py
@@ -0,0 +1,76 @@
+"""Module for building and extracting bitcode from applications using CMake"""
+
+import subprocess
+import json
+import os
+
+from compiler_opt.tools import extract_ir_lib
+
+CONFIGURE_LOG_NAME = './configure.log'
+BUILD_LOG_NAME = './build.log'
+
+
+def generate_configure_command(root_path, options_dict):
+  command_vector = ["cmake", "-G", "Ninja"]
+  for option in options_dict:
+    command_vector.append(f"-D{option}={options_dict[option]}")
+  # Add some default flags that are needed for bitcode extraction
+  command_vector.append("-DCMAKE_C_COMPILER=clang")
+  command_vector.append("-DCMAKE_CXX_COMPILER=clang++")
+  # These two flags assume a standard non-LTO build; this will need to be
+  # revisited when we want to support (Thin)LTO builds.
+  command_vector.append("-DCMAKE_C_FLAGS='-Xclang -fembed-bitcode=all'")
+  command_vector.append("-DCMAKE_CXX_FLAGS='-Xclang -fembed-bitcode=all'")
+  command_vector.append("-DCMAKE_EXPORT_COMPILE_COMMANDS=ON")
+  command_vector.append(root_path)
+  return command_vector
+
+
+def generate_build_command(targets, threads):
+  command_vector = ["ninja", "-j", str(threads)]
+  command_vector.extend(targets)
+  return command_vector
+
+
+def perform_build(configure_command_vector, build_command_vector, build_dir,
+                  corpus_dir):
+  configure_log_path = os.path.join(corpus_dir, CONFIGURE_LOG_NAME)
+  with open(configure_log_path, 'w') as configure_log_file:
+    configure_process = subprocess.run(
+        configure_command_vector,
+        cwd=build_dir,
+        check=True,
+        stderr=configure_log_file,
+        stdout=configure_log_file)
+    configure_success = configure_process.returncode == 0
+  build_log_path = os.path.join(corpus_dir, BUILD_LOG_NAME)
+  with open(build_log_path, 'w') as build_log_file:
+    build_process = subprocess.run(
+        build_command_vector,
+        cwd=build_dir,
+        check=True,
+        stderr=build_log_file,
+        stdout=build_log_file)
+    build_success = build_process.returncode == 0
+  return {
+      'targets': [{
+          'success': build_success and configure_success,
+          'build_log': BUILD_LOG_NAME,
+          'configure_log': CONFIGURE_LOG_NAME,
+          'name': 'all',
+          'build_success': build_success,
+          'configure_success': configure_success
+      }]
+  }
+
+
+def extract_ir(build_dir, corpus_dir, threads):
+  with open(os.path.join(
+      build_dir, "./compile_commands.json")) as compilation_command_db_file:
+    objects = extract_ir_lib.load_from_compile_commands(
+        json.load(compilation_command_db_file), corpus_dir)
+  relative_output_paths = extract_ir_lib.run_extraction(objects, threads,
+                                                        "llvm-objcopy", None,
+                                                        None, ".llvmcmd",
+                                                        ".llvmbc")
+  extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.jl b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.jl
new file mode 100644
index 000000000000000..b7fb9480774d4b3
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.jl
@@ -0,0 +1,27 @@
+"""
+Calling convention: julia julia_builder.jl <package name>
+After which you receive the system image, and a bitcode
+  archive which is to be unpacked with `ar -x`.
+"""
+
+using Pkg;
+
+# Rope in the ARGS given to Julia
+for x in ARGS
+    
+    # Adding the Julia package
+    try
+        Pkg.add(x);
+    catch e
+        # This message is currently buggy and prints regardless of the actual
+        # failure. Error given when the catch triggers alone:
+        #   TypeError: in using, expected Symbol, got a value of type Core.SlotNumber
+        println("Package not found.");
+    end
+
+    try
+        Pkg.test(x);
+    catch e
+        println("Testing package failed.");
+    end
+end
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.py
new file mode 100644
index 000000000000000..00de3aa9bd4f037
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/julia_builder.py
@@ -0,0 +1,118 @@
+"""Module for building and extracting bitcode from Julia applications"""
+
+import subprocess
+import os
+import pathlib
+import json
+import logging
+import shutil
+import glob
+
+from compiler_opt.tools import make_corpus_lib
+"""
+Generates the command to compile a bitcode archive from a Julia package.
+The archive then needs to be unpacked with `ar -x`.
+"""
+
+
+def generate_build_command(package_to_build, thread_count):
+  command_vector = [
+      "julia",
+      "--threads",
+      f"{thread_count}",
+      "--quiet",
+  ]
+
+  # Close out the Julia command line switches
+  command_vector.append("--")
+
+  julia_builder_jl_path = os.path.join(
+      os.path.dirname(__file__), 'julia_builder.jl')
+  command_vector.append(julia_builder_jl_path)
+
+  # Add the package to build
+  command_vector.append(package_to_build)
+
+  return command_vector
+
+
+def perform_build(package_name, build_dir, corpus_dir, thread_count):
+  build_command_vector = generate_build_command(package_name, thread_count)
+
+  build_log_name = f'./{package_name}.build.log'
+  build_log_path = os.path.join(corpus_dir, build_log_name)
+
+  environment = os.environ.copy()
+  julia_depot_path = os.path.join(build_dir, 'julia_depot')
+  environment['JULIA_DEPOT_PATH'] = julia_depot_path
+  environment['JULIA_PKG_SERVER'] = ''
+  julia_bc_path = os.path.join(build_dir, 'unopt_bc')
+  os.mkdir(julia_bc_path)
+  environment['JULIA_PKG_UNOPT_BITCODE_DIR'] = julia_bc_path
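+  # JULIA_PKG_UNOPT_BITCODE_DIR is presumably consumed by the patched Julia
+  # toolchain used for this dataset to dump unoptimized bitcode archives during
+  # precompilation; stock Julia does not appear to recognize it.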
+  environment['JULIA_IMAGE_THREADS'] = '1'
+  environment['JULIA_CPU_TARGET'] = 'x86-64'
+
+  try:
+    with open(build_log_path, 'w') as build_log_file:
+      subprocess.run(
+          build_command_vector,
+          cwd=build_dir,
+          stdout=build_log_file,
+          stderr=build_log_file,
+          env=environment,
+          check=True)
+  except subprocess.SubprocessError:
+    logging.warning(f'Failed to build julia package {package_name}')
+    build_success = False
+  else:
+    build_success = True
+  if build_success:
+    extract_ir(build_dir, corpus_dir)
+  return {
+      'targets': [{
+          'success': build_success,
+          'build_log': build_log_name,
+          'name': package_name
+      }]
+  }
+
+
+def unpack_archives(unopt_bc_archive_dir, unopt_bc_dir):
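+  # Each file in unopt_bc_archive_dir is an `ar` archive of bitcode produced by
+  # the Julia build; only the 'text_unopt#0.bc' member is kept (renamed after
+  # its package) and every other extracted member is deleted before moving on.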
+  archive_files = os.listdir(unopt_bc_archive_dir)
+  for archive_file in archive_files:
+    full_archive_file_path = os.path.join(unopt_bc_archive_dir, archive_file)
+    # Strip the last two characters, which are the '.a' extension.
+    archive_package_name = archive_file[:-2]
+
+    archive_extraction_command_vector = ['llvm-ar', '-x', archive_file]
+
+    subprocess.run(
+        archive_extraction_command_vector,
+        check=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+        cwd=unopt_bc_archive_dir)
+
+    # Copy text_unopt#0.bc to the output directory
+    unopt_bitcode_full_path = os.path.join(unopt_bc_archive_dir,
+                                           'text_unopt#0.bc')
+    copied_bitcode_full_path = os.path.join(unopt_bc_dir,
+                                            f'{archive_package_name}.bc')
+    shutil.copyfile(unopt_bitcode_full_path, copied_bitcode_full_path)
+
+    # Delete all bitcode files from the current extraction in preparation
+    # for the next archive.
+    for bitcode_file in glob.glob(os.path.join(unopt_bc_archive_dir, '*.bc')):
+      os.remove(bitcode_file)
+
+    os.remove(full_archive_file_path)
+
+
+def extract_ir(build_dir, corpus_dir):
+  unopt_bc_dir = os.path.join(build_dir, 'unopt_bc')
+  output_bc_dir = os.path.join(build_dir, 'output_bc')
+  os.mkdir(output_bc_dir)
+  unpack_archives(unopt_bc_dir, output_bc_dir)
+  relative_paths = make_corpus_lib.load_bitcode_from_directory(output_bc_dir)
+  make_corpus_lib.copy_bitcode(relative_paths, output_bc_dir, corpus_dir)
+  make_corpus_lib.write_corpus_manifest(relative_paths, corpus_dir, '')
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/manual_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/manual_builder.py
new file mode 100644
index 000000000000000..89e969283101cd9
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/manual_builder.py
@@ -0,0 +1,58 @@
+"""Module for building and extracting bitcode from applications using an
+arbitrary build system by manually running specified commands."""
+
+import subprocess
+import os
+
+from compiler_opt.tools import extract_ir_lib
+from compiler_opt.tools import make_corpus_lib
+
+BUILD_LOG_NAME = './build.log'
+
+
+def perform_build(commands_list, build_dir, threads, corpus_dir,
+                  environment_variables):
+  command_statuses = []
+  build_log_path = os.path.join(corpus_dir, BUILD_LOG_NAME)
+  for command in commands_list:
+    environment = os.environ.copy()
+    environment['JOBS'] = str(threads)
+    for environment_variable in environment_variables:
+      environment[environment_variable] = environment_variables[
+          environment_variable]
+    with open(build_log_path, 'w') as build_log_file:
+      build_process = subprocess.run(
+          command,
+          cwd=build_dir,
+          env=environment,
+          shell=True,
+          stderr=build_log_file,
+          stdout=build_log_file)
+      command_statuses.append(build_process.returncode == 0)
+  overall_success = True
+  for command_status in command_statuses:
+    if not command_status:
+      overall_success = False
+      break
+  return {
+      'targets': [{
+          'success': overall_success,
+          'build_log': BUILD_LOG_NAME,
+          'name': os.path.basename(corpus_dir)
+      }]
+  }
+
+
+def extract_ir(build_dir, corpus_dir, threads):
+  objects = extract_ir_lib.load_from_directory(build_dir, corpus_dir)
+  relative_output_paths = extract_ir_lib.run_extraction(objects, threads,
+                                                        "llvm-objcopy", None,
+                                                        None, ".llvmcmd",
+                                                        ".llvmbc")
+  extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
+
+
+def extract_raw_ir(build_dir, corpus_dir, threads):
+  relative_paths = make_corpus_lib.load_bitcode_from_directory(build_dir)
+  make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+  make_corpus_lib.write_corpus_manifest(relative_paths, corpus_dir, '')
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/spack_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/spack_builder.py
new file mode 100644
index 000000000000000..b8c32d5e2a20219
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/spack_builder.py
@@ -0,0 +1,226 @@
+"""Module for building and extracting bitcode from applications using spack"""
+
+import subprocess
+import os
+import tempfile
+import logging
+import pathlib
+import shutil
+import re
+
+import ray
+
+from compiler_opt.tools import extract_ir_lib
+
+from llvm_ir_dataset_utils.util import file
+from llvm_ir_dataset_utils.util import spack as spack_utils
+
+SPACK_THREAD_OVERSUBSCRIPTION_FACTOR = 1
+
+SPACK_GARBAGE_COLLECTION_TIMEOUT = 300
+
+BUILD_LOG_NAME = './spack_build.log'
+
+
+def get_spec_command_vector_section(spec):
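+  # Splits a stored spec string into argv tokens for spack, dropping the
+  # license="..." and patches=... attributes; e.g. (illustrative spec)
+  # 'zlib@1.2.13 license="Zlib" patches=abc123 %gcc@12.2.0' becomes
+  # ['zlib@1.2.13', '%gcc@12.2.0'].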
+  filtered_spec = re.sub(r'license=".*?" ', '', spec)
+  # Strip the patches list from a package that we're pushing to a build cache.
+  # There is at least one case where Spack fails to match the package for pushing
+  # to the buildcache after installation due to the patches string.
+  # TODO(boomanaiden154): Investigate why this is and remove it once this gets
+  # fixed.
+  filtered_spec2 = re.sub(r'patches=.*? ', '', filtered_spec)
+  return filtered_spec2.split(' ')
+
+
+def generate_build_command(package_to_build, threads, build_dir):
+  command_vector = [
+      'spack', '--insecure', '-c', f'config:build_stage:{build_dir}', 'install',
+      '--keep-stage', '--overwrite', '-y', '--use-buildcache',
+      'package:never,dependencies:only', '-j',
+      f'{SPACK_THREAD_OVERSUBSCRIPTION_FACTOR * threads}',
+      '--no-check-signature', '--deprecated'
+  ]
+  command_vector.extend(get_spec_command_vector_section(package_to_build))
+  return command_vector
+
+
+def perform_build(package_name, assembled_build_command, corpus_dir, build_dir):
+  logging.info(f"Spack building package {package_name}")
+  environment = os.environ.copy()
+  # Set $HOME to the build directory so that spack doesn't run into weird
+  # errors with multiple machines trying to write to a common home directory.
+  environment['HOME'] = build_dir
+  build_log_path = os.path.join(corpus_dir, BUILD_LOG_NAME)
+  try:
+    with open(build_log_path, 'w') as build_log_file:
+      subprocess.run(
+          assembled_build_command,
+          stdout=build_log_file,
+          stderr=build_log_file,
+          check=True,
+          env=environment)
+  except subprocess.SubprocessError:
+    logging.warn(f"Failed to build spack package {package_name}")
+    return False
+  logging.info(f"Finished build spack package {package_name}")
+  return True
+
+
+def get_spack_stage_directory(package_hash, build_dir):
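+  # Spack stages builds under '<build_dir>/<user>/spack-stage-<name>-<version>-<hash>'
+  # (typical Spack stage naming), so searching the stage names for the package
+  # hash is enough to locate this build's stage directory.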
+  spack_build_directory = os.path.join(build_dir, os.getlogin())
+  if not os.path.exists(spack_build_directory):
+    return None
+  spack_stages = os.listdir(spack_build_directory)
+  spack_stages.append('')
+  for spack_stage_dir in spack_stages:
+    if package_hash in spack_stage_dir:
+      break
+  # spack_stage_dir now contains the name of the stage directory, or the empty
+  # string if we failed to find one (e.g., due to a build failure).
+  if spack_stage_dir == '':
+    logging.warning(f'Failed to get stage dir for {package_hash}. This might '
+                    'have been caused by your spack installation and the '
+                    'package_list.json becoming out of sync.')
+    return None
+  return os.path.join(spack_build_directory, spack_stage_dir)
+
+
+def extract_ir(package_hash, corpus_dir, build_dir, threads):
+  build_directory = get_spack_stage_directory(package_hash, build_dir)
+  if build_directory is not None:
+    current_verbosity = logging.getLogger().getEffectiveLevel()
+    logging.getLogger().setLevel(logging.ERROR)
+    objects = extract_ir_lib.load_from_directory(build_directory, corpus_dir)
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objects, threads, "llvm-objcopy", None, None, ".llvmcmd", ".llvmbc")
+    extract_ir_lib.write_corpus_manifest(None, relative_output_paths,
+                                         corpus_dir)
+    logging.getLogger().setLevel(current_verbosity)
+
+
+def push_to_buildcache(package_spec, buildcache_dir, corpus_dir, build_dir):
+  command_vector = [
+      'spack', 'buildcache', 'push', '--unsigned', '--only', 'package',
+      buildcache_dir
+  ]
+  command_vector.extend(get_spec_command_vector_section(package_spec))
+  buildcache_push_log_path = os.path.join(corpus_dir, 'buildcache_push.log')
+  environment = os.environ.copy()
+  environment['HOME'] = build_dir
+  with open(buildcache_push_log_path, 'w') as buildcache_push_log_file:
+    subprocess.run(
+        command_vector,
+        check=True,
+        env=environment,
+        stdout=buildcache_push_log_file,
+        stderr=buildcache_push_log_file)
+
+
+def cleanup(package_name, package_spec, corpus_dir, build_dir, uninstall=True):
+  environment = os.environ.copy()
+  environment['HOME'] = build_dir
+  if uninstall:
+    uninstall_command_vector = ['spack', 'uninstall', '-y']
+    uninstall_command_vector.extend(
+        get_spec_command_vector_section(package_spec))
+    uninstall_log_path = os.path.join(corpus_dir, 'uninstall.log')
+    with open(uninstall_log_path, 'w') as uninstall_log_file:
+      subprocess.run(
+          uninstall_command_vector,
+          check=True,
+          env=environment,
+          stdout=uninstall_log_file,
+          stderr=uninstall_log_file)
+  # Garbage collect dependencies
+  try:
+    gc_command_vector = ['spack', 'gc', '-y']
+    gc_log_path = os.path.join(corpus_dir, 'gc.log')
+    with open(gc_log_path, 'w') as gc_log_file:
+      subprocess.run(
+          gc_command_vector,
+          check=True,
+          env=environment,
+          stdout=gc_log_file,
+          stderr=gc_log_file,
+          timeout=SPACK_GARBAGE_COLLECTION_TIMEOUT)
+  except subprocess.SubprocessError:
+    logging.warning(
+        f'Failed to garbage collect while cleaning up package {package_name}.')
+
+
+def construct_build_log(build_success, package_name):
+  return {
+      'targets': [{
+          'name': package_name,
+          'build_log': BUILD_LOG_NAME,
+          'success': build_success
+      }]
+  }
+
+
+def spack_add_mirror(build_dir, buildcache_dir):
+  environment = os.environ.copy()
+  environment['HOME'] = build_dir
+  command_vector = ['spack', 'mirror', 'add', 'buildcache', buildcache_dir]
+  subprocess.run(
+      command_vector,
+      check=True,
+      env=environment,
+      stdout=subprocess.DEVNULL,
+      stderr=subprocess.DEVNULL)
+
+
+def spack_setup_bootstrap_root(build_dir):
+  # TODO(boomanaiden154): Pull out the hardcoded /tmp/spack-bootstrap path and
+  # make it configurable somewhere.
+  bootstrap_dir = os.path.join(build_dir, 'spack-bootstrap')
+  shutil.copytree('/tmp/spack-bootstrap', bootstrap_dir)
+  command_vector = ['spack', 'bootstrap', 'root', bootstrap_dir]
+  environment = os.environ.copy()
+  environment['HOME'] = build_dir
+  subprocess.run(
+      command_vector,
+      env=environment,
+      check=True,
+      stdout=subprocess.DEVNULL,
+      stderr=subprocess.DEVNULL)
+
+
+def build_package(dependency_futures,
+                  package_name,
+                  package_spec,
+                  package_hash,
+                  corpus_dir,
+                  threads,
+                  buildcache_dir,
+                  build_dir,
+                  cleanup_build=False):
+  dependency_futures = ray.get(dependency_futures)
+  for dependency_future in dependency_futures:
+    if not dependency_future['targets'][0]['success']:
+      logging.warning(
+          f'Dependency {dependency_future["targets"][0]["name"]} failed to build for package {package_name}, not building.'
+      )
+      if cleanup_build:
+        cleanup(
+            package_name, package_spec, corpus_dir, build_dir, uninstall=False)
+      return construct_build_log(False, package_name)
+  spack_add_mirror(build_dir, buildcache_dir)
+  spack_utils.spack_setup_compiler(build_dir)
+  spack_utils.spack_setup_config(build_dir)
+  spack_setup_bootstrap_root(build_dir)
+  build_command = generate_build_command(package_spec, threads, build_dir)
+  build_result = perform_build(package_name, build_command, corpus_dir,
+                               build_dir)
+  if build_result:
+    extract_ir(package_hash, corpus_dir, build_dir, threads)
+    push_to_buildcache(package_spec, buildcache_dir, corpus_dir, build_dir)
+    logging.warning(f'Finished building {package_name}')
+  if cleanup_build:
+    if build_result:
+      cleanup(package_name, package_spec, corpus_dir, build_dir)
+    else:
+      cleanup(
+          package_name, package_spec, corpus_dir, build_dir, uninstall=False)
+  return construct_build_log(build_result, package_name)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/swift_builder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/swift_builder.py
new file mode 100644
index 000000000000000..94c52150ae37d5e
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/builders/swift_builder.py
@@ -0,0 +1,56 @@
+"""Module for building and extracting bitcode from Swift packages."""
+
+import subprocess
+import os
+import logging
+
+from compiler_opt.tools import extract_ir_lib
+
+BUILD_TIMEOUT = 900
+
+BUILD_LOG_NAME = './build.log'
+
+
+def perform_build(source_dir, build_dir, corpus_dir, thread_count,
+                  package_name):
+  build_command_vector = [
+      'swift', 'build', '-c', 'release', '-Xswiftc', '-embed-bitcode',
+      '--emit-swift-module-separately', '-Xswiftc', '-Onone', '-j',
+      str(thread_count), '--build-path', build_dir
+  ]
+
+  build_log_path = os.path.join(corpus_dir, BUILD_LOG_NAME)
+
+  try:
+    with open(build_log_path, 'w') as build_log_file:
+      subprocess.run(
+          build_command_vector,
+          cwd=source_dir,
+          stdout=build_log_file,
+          stderr=build_log_file,
+          check=True,
+          timeout=BUILD_TIMEOUT)
+  except (subprocess.SubprocessError, FileNotFoundError):
+    # TODO(boomanaiden154): Figure out why a FileNotFoundError is thrown here
+    # sometimes because it should be handled earlier.
+    logging.warning(f'Failed to build swift package in {package_name}')
+    build_success = False
+  else:
+    build_success = True
+  if build_success:
+    extract_ir(build_dir, corpus_dir, thread_count)
+  return {
+      'targets': [{
+          'success': build_success,
+          'build_log': BUILD_LOG_NAME,
+          'name': package_name
+      }]
+  }
+
+
+def extract_ir(build_dir, corpus_dir, threads):
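+  # Swift's -embed-bitcode places the module bitcode and command line in the
+  # '__LLVM,__bitcode' and '__LLVM,__swift_cmdline' sections, so those section
+  # names are passed to the extractor instead of the .llvmbc/.llvmcmd sections
+  # used by the clang-based builders.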
+  objects = extract_ir_lib.load_from_directory(build_dir, corpus_dir)
+  relative_output_paths = extract_ir_lib.run_extraction(
+      objects, threads, "llvm-objcopy", None, None, "__LLVM,__swift_cmdline",
+      "__LLVM,__bitcode")
+  extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/git_source.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/git_source.py
new file mode 100644
index 000000000000000..b948d0a4188b326
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/git_source.py
@@ -0,0 +1,50 @@
+"""Module that downloads git repositories"""
+
+import os
+import subprocess
+import logging
+
+
+def download_source_code(repo_url, repo_name, commit_sha, base_dir, corpus_dir):
+  # If the directory already exists, we can skip downloading the source,
+  # currently just assuming that the requested commit is present
+  if not os.path.exists(os.path.join(base_dir, repo_name)):
+    with open(os.path.join(corpus_dir, 'git.log'), 'w') as git_log_file:
+      git_command_vector = ["git", "clone", repo_url]
+      if commit_sha is None or commit_sha == '':
+        git_command_vector.append('--depth=1')
+      git_command_vector.append(repo_name)
+      logging.info(f"Cloning git repository {repo_url}")
+      environment = os.environ.copy()
+      environment['GIT_TERMINAL_PROMPT'] = '0'
+      environment['GIT_ASKPASS'] = 'echo'
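+      # Disable interactive credential prompts so clones of private or missing
+      # repositories fail immediately instead of hanging the build.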
+      try:
+        subprocess.run(
+            git_command_vector,
+            cwd=base_dir,
+            stdout=git_log_file,
+            stderr=git_log_file,
+            env=environment,
+            check=True)
+        if commit_sha is not None and commit_sha != '':
+          commit_checkout_vector = ["git", "checkout", commit_sha]
+          logging.info(f"Checked out commit SHA {commit_sha}")
+          subprocess.run(
+              commit_checkout_vector,
+              cwd=os.path.join(base_dir, repo_name),
+              stdout=git_log_file,
+              stderr=git_log_file,
+              check=True)
+        success = True
+      except subprocess.SubprocessError:
+        logging.warning(
+            f'Cloning and checking out git repository {repo_url} failed.')
+        success = False
+  else:
+    success = True
+  return {
+      'type': 'git',
+      'repo_url': repo_url,
+      'commit_sha': commit_sha,
+      'success': success
+  }
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/source.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/source.py
new file mode 100644
index 000000000000000..0d415dda7c55153
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/source.py
@@ -0,0 +1,23 @@
+"""Module that automatically downloads source code based on a source
+description."""
+
+from llvm_ir_dataset_utils.sources import git_source
+from llvm_ir_dataset_utils.sources import tar_source
+
+
+def download_source(source_descriptions, base_dir, corpus_dir, folder_name):
+  source_logs = []
+  for source_description in source_descriptions:
+    if (source_description['type'] == 'git'):
+      source_logs.append(
+          git_source.download_source_code(source_description['repo_url'],
+                                          folder_name,
+                                          source_description['commit_sha'],
+                                          base_dir, corpus_dir))
+    elif (source_description['type'] == 'tar'):
+      source_logs.append(
+          tar_source.download_source_code(source_description['archive_url'],
+                                          base_dir, folder_name))
+    if source_logs[-1]['success']:
+      return source_logs
+  return source_logs
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/tar_source.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/tar_source.py
new file mode 100644
index 000000000000000..ccf13c68013de8b
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/sources/tar_source.py
@@ -0,0 +1,35 @@
+"""Module that downloads and extracts tar archives."""
+
+import os
+import tarfile
+import tempfile
+import shutil
+import logging
+import requests
+import io
+import urllib3
+
+
+def download_source_code(archive_url, base_dir, source_folder_name):
+  # Disable warnings, otherwise we get a lot of warnings about disabling SSL
+  # verification.
+  urllib3.disable_warnings()
+  try:
+    with tempfile.TemporaryDirectory() as download_dir:
+      tar_archive = requests.get(archive_url, verify=False)
+      tar_archive_file = io.BytesIO(tar_archive.content)
+      with tarfile.open(fileobj=tar_archive_file) as source_tar_archive:
+        source_tar_archive.extractall(download_dir)
+      download_dir_files = os.listdir(download_dir)
+      if len(download_dir_files) != 0:
+        real_source_folder_name = os.path.join(download_dir,
+                                               download_dir_files[0])
+        shutil.move(real_source_folder_name,
+                    os.path.join(base_dir, source_folder_name))
+        success = True
+      else:
+        success = False
+  except (EOFError, OSError, tarfile.ReadError):
+    logging.warning(f'Downloading tar archive {archive_url} failed.')
+    success = False
+  return {'type': 'tar', 'archive_url': archive_url, 'success': success}
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/__init__.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/__init__.py
new file mode 100644
index 000000000000000..e69de29bb2d1d64
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/aggregate_build_sizes.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/aggregate_build_sizes.py
new file mode 100644
index 000000000000000..14c698138782959
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/aggregate_build_sizes.py
@@ -0,0 +1,57 @@
+"""Tool for aggregating and providing statistics on bitcode size."""
+
+import os
+import logging
+
+from absl import flags
+from absl import app
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The base directory of the corpus')
+flags.DEFINE_string(
+    'per_package_output', None,
+    'The path to a CSV file containing the name of each package and the amount '
+    'of bitcode that it has.')
+
+flags.mark_flag_as_required('corpus_dir')
+
+
+@ray.remote
+def get_size_from_manifest(corpus_path):
+  build_manifest = dataset_corpus.load_json_from_corpus(
+      corpus_path, "./build_manifest.json")
+  package_name = dataset_corpus.get_corpus_name(corpus_path)
+  if build_manifest is None:
+    return (package_name, 0, False)
+  return (package_name, build_manifest['size'])
+
+
+def main(_):
+  build_corpora = os.listdir(FLAGS.corpus_dir)
+  logging.info(f'Gathering data from {len(build_corpora)} builds.')
+  size_futures = []
+  for build_corpus in build_corpora:
+    corpus_path = os.path.join(FLAGS.corpus_dir, build_corpus)
+    size_futures.append(get_size_from_manifest.remote(corpus_path))
+  names_sizes = ray.get(size_futures)
+
+  size_sum = 0
+  for name_size in names_sizes:
+    size_sum += name_size[1]
+  logging.info(f'Aggregate size:{size_sum}')
+
+  if FLAGS.per_package_output is not None:
+    names_sizes = sorted(
+        names_sizes, key=lambda name_size: name_size[1], reverse=True)
+    with open(FLAGS.per_package_output, 'w') as per_package_index_file:
+      for name_size in names_sizes:
+        per_package_index_file.write(f'{name_size[0]},{name_size[1]}\n')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_licenses.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_licenses.py
new file mode 100644
index 000000000000000..630eca836633aab
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_licenses.py
@@ -0,0 +1,100 @@
+"""A script for analyzing the license distribution of an already built corpus.
+"""
+
+import os
+import logging
+import sys
+
+from absl import flags
+from absl import app
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The base directory of the corpus')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The maximum number of projects to consider')
+flags.DEFINE_boolean(
+    'ignore_license_files', False,
+    'Whether or not to ignore the constraint that license files must be present for attribution'
+)
+
+flags.mark_flag_as_required('corpus_dir')
+
+PERMISSIVE_LICENSES = {
+    'MIT': True,
+    'Apache-2.0': True,
+    'BSD-3-Clause': True,
+    'BSD-2-Clause': True
+}
+
+
+@ray.remote
+def get_information_from_manifest(corpus_path):
+  build_manifest = dataset_corpus.load_json_from_corpus(
+      corpus_path, './build_manifest.json')
+  package_name = dataset_corpus.get_corpus_name(corpus_path)
+  if build_manifest is None:
+    return (package_name, '', [])
+  license_files_ids = [
+      license_file['license']
+      for license_file in build_manifest['license_files']
+  ]
+  package_license = build_manifest['license']
+  return (package_name, package_license, license_files_ids,
+          build_manifest['size'])
+
+
+def main(_):
+  build_corpora = os.listdir(FLAGS.corpus_dir)
+  logging.info(f'Gathering data from {len(build_corpora)} builds.')
+  license_futures = []
+  for build_corpus in build_corpora:
+    corpus_path = os.path.join(FLAGS.corpus_dir, build_corpus)
+    license_futures.append(get_information_from_manifest.remote(corpus_path))
+
+    if len(license_futures) >= FLAGS.max_projects:
+      break
+  license_information = ray.get(license_futures)
+
+  logging.info(f'Processing license information')
+
+  valid_licenses = 0
+  invalid_licenses = 0
+  total_usable_bitcode = 0
+
+  for package_license_info in license_information:
+    license_parts = [
+        part.strip() for part in package_license_info[1].split('OR')
+    ]
+    has_valid_license = False
+    for license_part in license_parts:
+      if license_part not in PERMISSIVE_LICENSES:
+        continue
+      if FLAGS.ignore_license_files and license_part in PERMISSIVE_LICENSES:
+        has_valid_license = True
+        break
+      if license_part in package_license_info[2]:
+        has_valid_license = True
+        break
+
+    if has_valid_license:
+      valid_licenses += 1
+      total_usable_bitcode += package_license_info[3]
+    else:
+      invalid_licenses += 1
+
+  logging.info(
+      f'Found {valid_licenses} packages with valid license information and {invalid_licenses} packages with invalid license information'
+  )
+
+  logging.info(
+      f'Total usable bitcode given the current licensing constraints: {total_usable_bitcode}'
+  )
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_package_list_licenses.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_package_list_licenses.py
new file mode 100644
index 000000000000000..645bc2c921f8c4d
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/audit_package_list_licenses.py
@@ -0,0 +1,50 @@
+"""A script for analyzing the license buildup of a list of packages.
+"""
+
+import json
+import os
+import logging
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('input_file', None, 'The input file to look at')
+flags.DEFINE_boolean(
+    'is_spack', False,
+    'Whether or not to treat the input file as being a list of spack packages.')
+
+flags.mark_flag_as_required('input_file')
+
+PERMISSIVE_LICENSES = {
+    'MIT': True,
+    'Apache-2.0': True,
+    'BSD-3-Clause': True,
+    'BSD-2-Clause': True
+}
+
+
+def main(_):
+  with open(FLAGS.input_file) as input_file_handle:
+    input_data = json.load(input_file_handle)
+
+  good_licenses = 0
+  bad_licenses = 0
+
+  for package in input_data:
+    if FLAGS.is_spack:
+      package = input_data[package]
+    license_parts = [part.strip() for part in package['license'].split('OR')]
+    for license_part in license_parts:
+      if license_part in PERMISSIVE_LICENSES:
+        good_licenses += 1
+      else:
+        bad_licenses += 1
+
+  logging.info(f'Packages that can be used by the dataset: {good_licenses}')
+  logging.info(f'Packages that cannot be used by the dataset: {bad_licenses}')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_crate_from_repository.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_crate_from_repository.py
new file mode 100644
index 000000000000000..5e1f6955769aadb
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_crate_from_repository.py
@@ -0,0 +1,107 @@
+"""Tool to build a crate given just a repository."""
+
+import json
+import logging
+
+from absl import app
+from absl import flags
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('repository', None, 'The repository url to clone from.')
+flags.DEFINE_string('repository_list', None,
+                    'Path to a file containing a list of repositories.')
+flags.DEFINE_string('source_dir', None,
+                    'The directory to download source code into.')
+flags.DEFINE_string('build_dir', None,
+                    'The base directory to perform builds in.')
+flags.DEFINE_string('corpus_dir', None, 'The directory to place the corpus in.')
+flags.DEFINE_integer('thread_count', 8,
+                     'The number of threads to use per crate build.')
+flags.DEFINE_string('cargo_home', '/cargo', 'The default cargo directory.')
+flags.DEFINE_string('rustup_home', '/rustup',
+                    'The default rustup home directory.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus for each package into an archive.')
+
+flags.mark_flag_as_required('source_dir')
+flags.mark_flag_as_required('build_dir')
+flags.mark_flag_as_required('corpus_dir')
+
+
+@flags.multi_flags_validator(
+    ['repository', 'repository_list'],
+    message=(
+        'Expected one and only one of --repository and --repository_list to be '
+        'defined.'),
+)
+def _validate_input_columns(flags_dict):
+  both_defined = flags_dict['repository'] is not None and flags_dict[
+      'repository_list'] is not None
+  neither_defined = flags_dict['repository'] is None and flags_dict[
+      'repository_list'] is None
+  return not both_defined and not neither_defined
+
+
+def main(_):
+  ray.init()
+  crates_list = []
+  if FLAGS.repository is not None:
+    crates_list.append(FLAGS.repository)
+  elif FLAGS.repository_list is not None:
+    with open(FLAGS.repository_list) as repository_list_file:
+      crates_list = json.load(repository_list_file)
+
+  build_futures = []
+  for index, crate_to_build in enumerate(crates_list):
+    sources = []
+    if crate_to_build['repository'] is not None:
+      sources.append({
+          'type': 'git',
+          'repo_url': crate_to_build['repository'],
+          'commit_sha': ''
+      })
+    if crate_to_build['tar_archive'] is not None:
+      sources.append({
+          'type': 'tar',
+          'archive_url': crate_to_build['tar_archive']
+      })
+    corpus_description = {
+        'sources': sources,
+        'folder_name': f'build-{index}',
+        'build_system': 'cargo',
+        'license': crate_to_build['license']
+    }
+
+    additional_build_env_variables = {
+        'RUSTUP_HOME': FLAGS.rustup_home,
+        'CARGO_HOME': FLAGS.cargo_home
+    }
+
+    build_futures.append(
+        builder.get_build_future(
+            corpus_description,
+            FLAGS.source_dir,
+            FLAGS.build_dir,
+            FLAGS.corpus_dir,
+            FLAGS.thread_count,
+            additional_build_env_variables,
+            cleanup=True,
+            archive_corpus=FLAGS.archive_corpus))
+
+  all_finished = []
+  while len(build_futures) > 0:
+    finished, build_futures = ray.wait(build_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    all_finished.extend(finished_data)
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(all_finished)} done, {len(build_futures)} remaining'
+    )
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_julia_packages.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_julia_packages.py
new file mode 100644
index 000000000000000..cb8bb6679d8a57a
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_julia_packages.py
@@ -0,0 +1,71 @@
+"""Tool for building a list of julia packages."""
+
+import logging
+import json
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', None, 'The path to the package list.')
+flags.DEFINE_string('source_dir', '/tmp/source',
+                    'Path to a directory to download source code into.')
+flags.DEFINE_string('build_dir', None,
+                    'The base directory to perform builds in.')
+flags.DEFINE_string('corpus_dir', None, 'The directory to place the corpus in.')
+flags.DEFINE_integer('thread_count', 2,
+                     'The number of threads to use per package build.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus into an archive to reduce inode usage.'
+)
+
+flags.mark_flag_as_required('build_dir')
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('package_list')
+
+
+def main(_):
+  ray.init()
+
+  with open(FLAGS.package_list) as package_list_file:
+    package_list = json.load(package_list_file)
+
+  build_futures = []
+  for package in package_list:
+    corpus_description = {
+        'sources': [{
+            'type': 'git',
+            'repo_url': package['repo'],
+            'commit_sha': None
+        }],
+        'folder_name': package['name'],
+        'build_system': 'julia',
+        'package_name': package['name'],
+        'license': package['license']
+    }
+
+    build_futures.append(
+        builder.get_build_future(
+            corpus_description,
+            FLAGS.source_dir,
+            FLAGS.build_dir,
+            FLAGS.corpus_dir,
+            FLAGS.thread_count, {},
+            cleanup=True,
+            archive_corpus=FLAGS.archive_corpus))
+
+  while len(build_futures) > 0:
+    finished, build_futures = ray.wait(build_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(build_futures)} remaining.')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_spack_package_from_list.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_spack_package_from_list.py
new file mode 100644
index 000000000000000..d4d5126f62f32b0
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_spack_package_from_list.py
@@ -0,0 +1,110 @@
+"""A tool for building individual spack packages or an entire list from a list
+of spack packages and their dependencies.
+"""
+
+import json
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', None, 'The list of spack packages and '
+                    'their dependencies.')
+flags.DEFINE_string('package_name', None, 'The name of an individual package '
+                    'to build.')
+flags.DEFINE_string('corpus_dir', None, 'The path to the corpus.')
+flags.DEFINE_string(
+    'source_dir', '/tmp/source', 'The source dir to pass along '
+    'to the builder. This is not used by the spack builder.')
+flags.DEFINE_string(
+    'build_dir', None, 'The build dir to pass along to '
+    'the builder. This is not used by the spack builder.')
+flags.DEFINE_string(
+    'buildcache_dir', None,
+    'The directory of the spack buildcache to store built packages in.')
+flags.DEFINE_integer('thread_count', 16,
+                     'The number of threads to use per job.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus for each package into an archive.')
+flags.DEFINE_bool('cleanup', True,
+                  'Whether or not to clean up the build directory')
+
+flags.mark_flag_as_required('package_list')
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('build_dir')
+flags.mark_flag_as_required('buildcache_dir')
+
+
+def get_package_future(package_dict, current_package_futures, package, threads):
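+  # Recursively creates build futures for the package's dependencies first and
+  # passes them through extra_builder_arguments, so the package's build waits
+  # for its dependencies before building. current_package_futures memoizes
+  # futures so shared dependencies are only scheduled once.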
+  if package in current_package_futures:
+    return current_package_futures[package]
+  dependency_futures = []
+  for dependency in package_dict[package]['deps']:
+    if dependency in current_package_futures:
+      dependency_futures.append(current_package_futures[dependency])
+    else:
+      dependency_futures.append(
+          get_package_future(package_dict, current_package_futures, dependency,
+                             threads))
+  corpus_description = {
+      'build_system': 'spack',
+      'folder_name': f'{package_dict[package]["name"]}-{package}',
+      'package_name': package_dict[package]['name'],
+      'package_spec': package_dict[package]['spec'],
+      'package_hash': package,
+      'license': package_dict[package]['license'],
+      'sources': []
+  }
+  extra_builder_arguments = {
+      'dependency_futures': dependency_futures,
+      'buildcache_dir': FLAGS.buildcache_dir
+  }
+  build_future = builder.get_build_future(
+      corpus_description,
+      FLAGS.source_dir,
+      FLAGS.build_dir,
+      FLAGS.corpus_dir,
+      threads, {},
+      extra_builder_arguments=extra_builder_arguments,
+      cleanup=FLAGS.cleanup,
+      archive_corpus=FLAGS.archive_corpus)
+  current_package_futures[package] = build_future
+  return build_future
+
+
+def main(_):
+  with open(FLAGS.package_list) as package_list_file:
+    package_dict = json.load(package_list_file)
+
+  ray.init()
+  build_futures = []
+  build_futures_dict = {}
+
+  if FLAGS.package_name:
+    for package in package_dict:
+      if package_dict[package]['name'] == FLAGS.package_name:
+        break
+    else:
+      raise ValueError(
+          f'Package {FLAGS.package_name} was not found in the package list.')
+    build_futures.append(
+        get_package_future(package_dict, build_futures_dict, package,
+                           FLAGS.thread_count))
+  else:
+    for package in package_dict:
+      build_future = get_package_future(package_dict, build_futures_dict,
+                                        package, FLAGS.thread_count)
+      build_futures.append(build_future)
+      build_futures_dict[package] = build_future
+
+  ray.get(build_futures)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_swift_packages.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_swift_packages.py
new file mode 100644
index 000000000000000..382dd9678d5f894
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/build_swift_packages.py
@@ -0,0 +1,67 @@
+"""Tool for building a list of cargo packages."""
+
+import logging
+import json
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', None, 'The path to the package list.')
+flags.DEFINE_string('source_dir', None,
+                    'The path to the directory to download source code into.')
+flags.DEFINE_string('build_dir', None,
+                    'The base directory to perform builds in.')
+flags.DEFINE_string('corpus_dir', None, 'The directory to place the corpus in')
+flags.DEFINE_integer('thread_count', 2,
+                     'The number of threads to use per package build.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus for each package into an archive.')
+
+
+def main(_):
+  ray.init()
+
+  with open(FLAGS.package_list) as package_list_file:
+    package_repositories = json.load(package_list_file)
+
+  build_futures = []
+
+  for index, package_repository in enumerate(package_repositories):
+    corpus_description = {
+        'sources': [{
+            'type': 'git',
+            'repo_url': package_repository['repo'],
+            'commit_sha': None
+        }],
+        'folder_name': f'build-{index}',
+        'build_system': 'swift',
+        'package_name': f'build-{index}',
+        'license': package_repository['license']
+    }
+
+    build_futures.append(
+        builder.get_build_future(
+            corpus_description,
+            FLAGS.source_dir,
+            FLAGS.build_dir,
+            FLAGS.corpus_dir,
+            FLAGS.thread_count, {},
+            cleanup=True,
+            archive_corpus=FLAGS.archive_corpus))
+
+  while len(build_futures) > 0:
+    finished, build_futures = ray.wait(build_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(build_futures)} remaining.')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/collect_textual_ir.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/collect_textual_ir.py
new file mode 100644
index 000000000000000..dfa9d8bbb3a4c77
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/collect_textual_ir.py
@@ -0,0 +1,58 @@
+"""A script for collecting a large amount of textual IR into a single file,
+aimed primarily at training basic BPE tokenizers."""
+
+import os
+import logging
+import subprocess
+
+from absl import app
+from absl import flags
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+from llvm_ir_dataset_utils.util import bitcode_module
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'corpus_dir', None,
+    'The corpora to use for generating the set of textual IR.')
+flags.DEFINE_string('output_file', None,
+                    'The output file to put all the textual IR into.')
+flags.DEFINE_integer('max_projects', 10,
+                     'The maximum number of projects per corpus.')
+
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('output_file')
+
+
+def process_single_project(project_dir):
+  all_textual_ir = ''
+  try:
+    bitcode_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
+  except Exception:
+    return ''
+  for bitcode_path in bitcode_paths:
+    bitcode_file_data = dataset_corpus.load_file_from_corpus(
+        project_dir, bitcode_path)
+    textual_ir_or_error = bitcode_module.get_textual_ir(bitcode_file_data)
+    if textual_ir_or_error[0]:
+      continue
+    all_textual_ir += textual_ir_or_error[1]
+  return all_textual_ir
+
+
+def main(_):
+  all_textual_ir = ''
+
+  for corpus_dir in FLAGS.corpus_dir:
+    for project_dir in os.listdir(corpus_dir)[:FLAGS.max_projects]:
+      logging.info(f'Processing {project_dir} in {corpus_dir}')
+      full_project_dir = os.path.join(corpus_dir, project_dir)
+      all_textual_ir += process_single_project(full_project_dir)
+
+  with open(FLAGS.output_file, 'w') as output_file:
+    output_file.write(all_textual_ir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/corpus_from_description.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/corpus_from_description.py
new file mode 100644
index 000000000000000..b5c3e23b4952a50
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/corpus_from_description.py
@@ -0,0 +1,62 @@
+"""Tool that builds a bitcode corpus from a description"""
+
+import json
+import multiprocessing
+import logging
+
+from absl import app
+from absl import flags
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("corpus_description", None,
+                    "The path to the JSON description file")
+flags.DEFINE_string("source_dir", None,
+                    "The base directory to download source code into.")
+flags.DEFINE_string("build_dir", None,
+                    "The base directory to perform the build in")
+flags.DEFINE_string("corpus_dir", None, "The base directory to put the corpus")
+flags.DEFINE_string(
+    "buildcache_dir", "/tmp/buildcache",
+    "The directory of the spack build cache to store packages in. Only used "
+    "the spack builder.")
+flags.DEFINE_bool(
+    'cleanup', False, 'Whether or not to cleanup the source and '
+    'build directories after finishing a build.')
+flags.DEFINE_integer('thread_count', multiprocessing.cpu_count(), 'The number '
+                     'of threads to use per job.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus into an archive to reduce inode usage.'
+)
+
+flags.mark_flag_as_required("corpus_description")
+flags.mark_flag_as_required("source_dir")
+flags.mark_flag_as_required("build_dir")
+flags.mark_flag_as_required("corpus_dir")
+
+
+def main(_):
+  ray.init()
+  with open(FLAGS.corpus_description) as corpus_description_file:
+    corpus_description = json.load(corpus_description_file)
+    extra_builder_arguments = {'buildcache_dir': FLAGS.buildcache_dir}
+    build_future = builder.get_build_future(
+        corpus_description,
+        FLAGS.source_dir,
+        FLAGS.build_dir,
+        FLAGS.corpus_dir,
+        FLAGS.thread_count, {},
+        cleanup=FLAGS.cleanup,
+        extra_builder_arguments=extra_builder_arguments,
+        archive_corpus=FLAGS.archive_corpus)
+    logging.info('Starting build.')
+    ray.get(build_future)
+    logging.info('Build finished.')
+
+
+if __name__ == "__main__":
+  app.run(main)
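The description file itself is plain JSON; its schema is defined by the builders elsewhere in the patch and by the corpus_descriptions directories. As a rough sketch based only on the fields that build_swift_packages.py constructs above (all values here are hypothetical):

    import json

    # Hypothetical corpus description; the real schema lives with the builders.
    description = {
        'sources': [{
            'type': 'git',
            'repo_url': 'https://github.com/example/project',
            'commit_sha': None
        }],
        'folder_name': 'example-project',
        'build_system': 'swift',
        'package_name': 'example-project',
        'license': 'MIT'
    }

    with open('corpus_description.json', 'w') as description_file:
      json.dump(description, description_file, indent=2)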
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/count_tokens.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/count_tokens.py
new file mode 100644
index 000000000000000..c3a7063595b25e9
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/count_tokens.py
@@ -0,0 +1,35 @@
+"""A tool for counting tokens from gathered statistics CSV files."""
+
+import logging
+import csv
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'stats_path', None,
+    'The path to a statistics file containing a token count.')
+flags.DEFINE_string('key', 'token_count', 'The column in the CSV to sum over.')
+
+
+def count_tokens_from_file(file_path):
+  token_count = 0
+  with open(file_path) as token_count_file:
+    token_count_reader = csv.DictReader(token_count_file)
+    for token_count_entry in token_count_reader:
+      token_count += int(token_count_entry[FLAGS.key])
+  return token_count
+
+
+def main(_):
+  total_token_count = 0
+  for stats_path in FLAGS.stats_path:
+    total_token_count += count_tokens_from_file(stats_path)
+
+  logging.info(f'Counted {total_token_count} tokens.')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/delete_folder.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/delete_folder.py
new file mode 100644
index 000000000000000..1bd2eec8105eb5a
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/delete_folder.py
@@ -0,0 +1,43 @@
+"""Tool for deleting a lot of inodes in parallel."""
+
+import os
+import shutil
+import logging
+
+import ray
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('folder', None,
+                    'The folder to delete all files/folders in.')
+
+flags.mark_flag_as_required('folder')
+
+
+ at ray.remote
+def delete_folder(folder_path):
+  if os.path.isdir(folder_path):
+    shutil.rmtree(folder_path)
+  elif os.path.isfile(folder_path):
+    os.remove(folder_path)
+  else:
+    logging.warning(f'Failed to delete {folder_path}, no file or directory')
+    return False
+  logging.warning(f'Deleted {folder_path}')
+  return True
+
+
+def main(_):
+  subfolders = os.listdir(FLAGS.folder)
+  subfolder_futures = []
+  for subfolder in subfolders:
+    subfolder_futures.append(
+        delete_folder.remote(os.path.join(FLAGS.folder, subfolder)))
+  ray.get(subfolder_futures)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/export_deduplicated_corpus.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/export_deduplicated_corpus.py
new file mode 100644
index 000000000000000..81d7a7f58d41dcc
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/export_deduplicated_corpus.py
@@ -0,0 +1,163 @@
+"""Tool for taking in a list of module hashes and extracting all deduplicated
+modules into a separate directory."""
+
+import os
+import logging
+import csv
+import shutil
+import pathlib
+import json
+
+from absl import flags
+from absl import app
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+from llvm_ir_dataset_utils.util import parallel
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string('module_hash_list', None,
+                          'A list of module hashes to pull from.')
+flags.DEFINE_string(
+    'output_path', None,
+    'The output path to place all the deduplicated modules into.')
+flags.DEFINE_integer('batch_size', 256,
+                     'The number of modules to put in each batch.')
+flags.DEFINE_boolean(
+    'split_by_corpora', True,
+    'Whether or not to put separate corpora (defined by module hash lists) into separate folders.'
+)
+
+flags.mark_flag_as_required('module_hash_list')
+flags.mark_flag_as_required('output_path')
+
+
+def load_module_hashes(file_path):
+  logging.info(f'Loading data from {file_path}')
+  module_hash_map = {}
+  corpus_name = os.path.splitext(os.path.basename(file_path))[0]
+  all_modules_count = 0
+  with open(file_path) as module_hashes_file:
+    module_hash_reader = csv.DictReader(module_hashes_file)
+    for module_hash_entry in module_hash_reader:
+      all_modules_count += 1
+      module_hash = module_hash_entry['module_hashes']
+      file_path = module_hash_entry['name']
+      # Skip empty modules, which hash to the default value of 4.
+      if module_hash == '4':
+        continue
+      module_hash_map[module_hash] = (file_path, corpus_name)
+  logging.info(f'Read {all_modules_count} modules.')
+  logging.info(f'Found {len(module_hash_map)} unique modules.')
+  return module_hash_map
+
+
+def create_manifest(file_path, modules_list):
+  corpus_description = {'has_thinlto': False, 'modules': []}
+  for module_tuple in modules_list:
+    # Omit the .bc file extension because it gets added on by different
+    # tooling.
+    corpus_description['modules'].append(f'{module_tuple[1]}')
+  with open(file_path, 'w') as corpus_description_file:
+    json.dump(corpus_description, corpus_description_file, indent=2)
+
+
+ at ray.remote(num_cpus=1)
+def process_module_batch(batch_path, modules_to_process):
+  pathlib.Path(batch_path).mkdir(parents=True)
+  for module_path in modules_to_process:
+    file_path_full = module_path[0]
+    module_hash = module_path[1]
+    file_path_parts = file_path_full.split(':')
+    bitcode_file = dataset_corpus.load_file_from_corpus(file_path_parts[0],
+                                                        file_path_parts[1])
+    with open(os.path.join(batch_path, f'{module_hash}.bc'),
+              'wb') as bitcode_file_handle:
+      bitcode_file_handle.write(bitcode_file)
+    # Process the .cmd file
+    command_line_file_path = file_path_parts[1][:-3] + '.cmd'
+    command_line_data = ''
+    if dataset_corpus.is_file_in_corpus(file_path_parts[0],
+                                        command_line_file_path):
+      command_line_data = dataset_corpus.load_file_from_corpus(
+          file_path_parts[0], command_line_file_path).decode('utf-8')
+    else:
+      command_line_data = ''
+    with open(os.path.join(batch_path, f'{module_hash}.cmd'),
+              'w') as command_line_file_handle:
+      command_line_file_handle.write(command_line_data)
+
+  create_manifest(
+      os.path.join(batch_path, 'corpus_description.json'), modules_to_process)
+  shutil.make_archive(batch_path, 'tar', batch_path)
+  shutil.rmtree(batch_path)
+
+
+def extract_files_from_hash_map(module_hash_map, output_path):
+  modules_to_process = {}
+
+  for module_hash in module_hash_map:
+    # Each key in the map points to a tuple of (file path, corpus name);
+    # the per-corpus lists built below hold (file path, module hash) tuples.
+    file_path, corpus_name = module_hash_map[module_hash]
+    if corpus_name in modules_to_process:
+      modules_to_process[corpus_name].append((file_path, module_hash))
+    else:
+      modules_to_process[corpus_name] = [(file_path, module_hash)]
+
+  module_batches = []
+
+  for corpus_name in modules_to_process:
+    current_module_batches = parallel.split_batches(
+        modules_to_process[corpus_name], FLAGS.batch_size)
+    output_module_batches = []
+    for current_module_batch in current_module_batches:
+      # TODO(boomanaiden154): Remove this once the issue in the parallel module
+      # related to creating empty batches on boundaries is fixed.
+      if len(current_module_batch) == 0:
+        continue
+      output_module_batch = []
+      for module_info in current_module_batch:
+        file_path, module_hash = module_info
+        output_module_batch.append((file_path, module_hash, corpus_name))
+      output_module_batches.append(output_module_batch)
+    module_batches.extend(output_module_batches)
+
+  module_batch_futures = []
+
+  for index, module_batch in enumerate(module_batches):
+    if FLAGS.split_by_corpora:
+      corpus_name = module_batch[0][2]
+      batch_path = os.path.join(FLAGS.output_path, corpus_name,
+                                f'batch-{index}')
+    else:
+      batch_path = os.path.join(FLAGS.output_path, f'batch-{index}')
+    module_batch_futures.append(
+        process_module_batch.remote(batch_path, module_batch))
+
+  while len(module_batch_futures) > 0:
+    finished, module_batch_futures = ray.wait(module_batch_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(module_batch_futures)} remaining.'
+    )
+
+
+def main(_):
+  ray.init()
+
+  pathlib.Path(FLAGS.output_path).mkdir(exist_ok=True, parents=True)
+
+  module_hash_map = {}
+
+  for module_hash_list_path in FLAGS.module_hash_list:
+    module_hash_map.update(load_module_hashes(module_hash_list_path))
+
+  extract_files_from_hash_map(module_hash_map, FLAGS.output_path)
+
+
+if __name__ == '__main__':
+  app.run(main)
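Deduplication here falls out of using the module hash as the dictionary key: each hash keeps exactly one (file path, corpus name) entry, and later hash lists override earlier ones via dict.update in main. An illustrative trace with made-up values:

    corpus_a = {'9f2c': ('corpus_a/pkg.tar:./mod0.bc', 'corpus_a')}
    corpus_b = {'9f2c': ('corpus_b/pkg.tar:./mod7.bc', 'corpus_b'),
                '11ab': ('corpus_b/pkg.tar:./mod9.bc', 'corpus_b')}

    module_hash_map = {}
    module_hash_map.update(corpus_a)
    module_hash_map.update(corpus_b)
    # Two unique modules remain; the duplicated hash '9f2c' now points at corpus_b.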
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/extract_build_failure_logs.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/extract_build_failure_logs.py
new file mode 100644
index 000000000000000..31e9e67651b4fa7
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/extract_build_failure_logs.py
@@ -0,0 +1,54 @@
+"""Tool to get build failure logs and copy them into a folder."""
+
+import os
+import shutil
+
+from absl import app
+from absl import flags
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The corpus directory.')
+flags.DEFINE_string(
+    'build_failures', None,
+    'The list of build failures from get_build_failure_logs.py')
+flags.DEFINE_string('output_dir', None, 'The path to the output directory.')
+
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('build_failures')
+
+
+def process_build_log(build_log_path):
+  if ':' in build_log_path:
+    # We have a tar archive, extract the file and write it to the output
+    # directory.
+    path_parts = build_log_path.split(':')
+    build_log = dataset_corpus.load_file_from_corpus(path_parts[0],
+                                                     path_parts[1])
+    corpus_name = os.path.basename(path_parts[0])[:-4]
+    output_file_path = os.path.join(FLAGS.output_dir, f'{corpus_name}.log')
+    print(output_file_path)
+    with open(output_file_path, 'wb') as output_file:
+      output_file.write(build_log)
+  else:
+    # We have a normal file and can just copy it over.
+    corpus_name = os.path.basename(os.path.dirname(build_log_path))
+    output_file_path = os.path.join(FLAGS.output_dir, f'{corpus_name}.log')
+    shutil.copyfile(build_log_path, output_file_path)
+
+
+def main(_):
+  # TODO(boomanaiden154): Probably turn this into a CSV reader at some point,
+  # but the other scripts shouldn't create any edge cases.
+  with open(FLAGS.build_failures) as build_failures_file:
+    build_failures = [line.rstrip() for line in build_failures_file]
+    for build_failure in build_failures:
+      failure_description_parts = build_failure.split(',')
+      if failure_description_parts[2] != 'NULL':
+        process_build_log(failure_description_parts[2])
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_build_failure_logs.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_build_failure_logs.py
new file mode 100644
index 000000000000000..83a547113f81b90
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_build_failure_logs.py
@@ -0,0 +1,78 @@
+"""Tool to find all the logs for targets that failed to build from a corpus
+directory."""
+
+import glob
+import os
+import json
+import logging
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None,
+                    'The corpus directory to look for build logs in.')
+
+flags.mark_flag_as_required('corpus_dir')
+
+
+ at ray.remote(num_cpus=1)
+def process_corpus(build_corpus_path):
+  build_manifest = dataset_corpus.load_json_from_corpus(
+      build_corpus_path, './build_manifest.json')
+  if build_manifest is None:
+    return None
+  for target in build_manifest['targets']:
+    if target['success'] == False and target['build_log'] is not None:
+      # We're assuming the spack builder here because that's mainly what this
+      # script is being used for currently.
+      # TODO(boomanaiden154): Make this more generic when #77 is fixed and the
+      # corpora have been rebuilt.
+      if build_corpus_path[-3:] == 'tar':
+        build_log_path = f'{build_corpus_path}:./spack_build.log'
+      else:
+        build_log_path = target['build_log']
+      return ('build_failure', target['name'], build_log_path)
+    if target['build_log'] is None:
+      return ('missing_logs', target['name'], None)
+  return None
+
+
+def main(_):
+  ray.init()
+
+  build_corpora = os.listdir(FLAGS.corpus_dir)
+  corpus_futures = []
+
+  for build_corpus in build_corpora:
+    corpus_path = os.path.join(FLAGS.corpus_dir, build_corpus)
+    corpus_futures.append(process_corpus.remote(corpus_path))
+
+  build_failures = 0
+  missing_logs = 0
+
+  while len(corpus_futures) > 0:
+    finished, corpus_futures = ray.wait(corpus_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    logging.info(
+        f'Just finished {len(finished)}, {len(corpus_futures)} remaining.')
+    for finished_corpus in finished_data:
+      if finished_corpus is not None:
+        if finished_corpus[0] == 'build_failure':
+          build_failures += 1
+          print(f'{finished_corpus[1]},failure,{finished_corpus[2]}')
+        elif finished_corpus[0] == 'missing_logs':
+          missing_logs += 1
+          print(f'{finished_corpus[1]},no_logs,NULL')
+
+  logging.warning(f'Found {build_failures} build failures.')
+  logging.warning(f'{missing_logs} targets were missing logs.')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_common_constants.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_common_constants.py
new file mode 100644
index 000000000000000..7f03fdf442c852a
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_common_constants.py
@@ -0,0 +1,120 @@
+"""Tool for getting common tokenizer constants from bitcode modules."""
+
+import os
+import logging
+import sys
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.util import bitcode_module
+from llvm_ir_dataset_utils.util import dataset_corpus
+from llvm_ir_dataset_utils.util import parallel
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None,
+                    'The corpus directory to look for modules in.')
+flags.DEFINE_integer(
+    'max_projects',
+    sys.maxsize,
+    'The maximum number of projects to process.',
+    lower_bound=1)
+flags.DEFINE_string('output_file', None, 'The output file to place results in.')
+
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('output_file')
+
+
+def combine_constant_histograms(part_a, part_b):
+  result_histogram = {}
+  for constant in list(set(list(part_a.keys()) + list(part_b.keys()))):
+    if constant in part_b and constant in part_a:
+      result_histogram[constant] = part_a[constant] + part_b[constant]
+    elif constant in part_a:
+      result_histogram[constant] = part_a[constant]
+    elif constant in part_b:
+      result_histogram[constant] = part_b[constant]
+  return result_histogram
+
+
+def get_constants_from_bitcode(project_dir, bitcode_file_path):
+  bitcode_file = dataset_corpus.load_file_from_corpus(project_dir,
+                                                      bitcode_file_path)
+  tokenized_functions = bitcode_module.get_tokenization(
+      bitcode_file)['functions']
+  constant_histogram = {}
+  for function in tokenized_functions:
+    for token in function['tokens']:
+      if token['type'] == 'constant_integer_operand':
+        if token['integer_constant'] in constant_histogram:
+          constant_histogram[token['integer_constant']] += 1
+        else:
+          constant_histogram[token['integer_constant']] = 1
+  return constant_histogram
+
+
+ at ray.remote(num_cpus=1)
+def get_constants_from_bitcode_batch(project_dir, bitcode_file_paths):
+  constant_histogram = {}
+  for bitcode_file_path in bitcode_file_paths:
+    constant_histogram = combine_constant_histograms(
+        constant_histogram,
+        get_constants_from_bitcode(project_dir, bitcode_file_path))
+  return constant_histogram
+
+
+ at ray.remote(num_cpus=1)
+def get_constants_from_project(project_dir):
+  try:
+    bitcode_file_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
+  except:
+    return {}
+
+  batches = parallel.split_batches(bitcode_file_paths, 16)
+  batch_futures = []
+  for batch in batches:
+    batch_futures.append(
+        get_constants_from_bitcode_batch.remote(project_dir, batch))
+
+  constant_histogram = {}
+  constant_histograms = ray.get(batch_futures)
+  for partial_constant_histogram in constant_histograms:
+    constant_histogram = combine_constant_histograms(
+        constant_histogram, partial_constant_histogram)
+
+  return constant_histogram
+
+
+def main(_):
+  ray.init()
+
+  projects = os.listdir(FLAGS.corpus_dir)
+
+  project_futures = []
+  for project in projects:
+    project_dir = os.path.join(FLAGS.corpus_dir, project)
+    project_futures.append(get_constants_from_project.remote(project_dir))
+
+    if len(project_futures) >= FLAGS.max_projects:
+      break
+
+  constant_histogram = {}
+
+  while len(project_futures) > 0:
+    finished, project_futures = ray.wait(project_futures, timeout=5.0)
+    logging.info(
+        f'Just finished {len(finished)}, {len(project_futures)} remaining.')
+    for project_histogram in ray.get(finished):
+      constant_histogram = combine_constant_histograms(constant_histogram,
+                                                       project_histogram)
+
+  with open(FLAGS.output_file, 'w') as output_file:
+    for constant in constant_histogram:
+      output_file.write(f'{constant},{constant_histogram[constant]}\n')
+
+
+if __name__ == '__main__':
+  app.run(main)
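For reference, combine_constant_histograms simply sums the counts of constants that appear in both inputs and carries the rest through unchanged, e.g.:

    part_a = {0: 5, 1: 3}
    part_b = {1: 2, 8: 7}
    combine_constant_histograms(part_a, part_b)
    # -> {0: 5, 1: 5, 8: 7} (key order may differ since the keys pass through a set)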
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_julia_packages.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_julia_packages.py
new file mode 100644
index 000000000000000..acab9b8a6cf4fb9
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_julia_packages.py
@@ -0,0 +1,112 @@
+"""Tool for getting Julia packages."""
+
+import glob
+import subprocess
+import tempfile
+import os
+import logging
+import json
+import sys
+
+from llvm_ir_dataset_utils.util import licenses
+
+from absl import app
+from absl import flags
+
+import toml
+import ray
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', 'julia_package_list.json',
+                    'The path to write the list of Julia packages to.')
+flags.DEFINE_string(
+    'gh_pat', None,
+    'Your Github personal access token. Needed to query license information.')
+flags.DEFINE_boolean(
+    'github_ld', False,
+    'Whether or not to download the repositories that have not already been '
+    'tagged with license information and use go-license-detector to detect '
+    'license information')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The max number of projects to process')
+
+flags.mark_flag_as_required('gh_pat')
+
+REGISTRY_REPOSITORY = 'https://github.com/JuliaRegistries/General'
+
+
+ at ray.remote(num_cpus=1)
+def get_detected_license_repo_future(repo_url, repo_name):
+  return (repo_name,
+          licenses.get_detected_license_from_repo(repo_url, repo_name))
+
+
+def main(_):
+  package_list = []
+  repository_url_list = []
+  with tempfile.TemporaryDirectory() as download_dir:
+    registry_path = os.path.join(download_dir, 'registry')
+    repository_clone_vector = [
+        'git', 'clone', REGISTRY_REPOSITORY, '--depth=1', registry_path
+    ]
+    logging.info('Cloning registry repository.')
+    subprocess.run(
+        repository_clone_vector,
+        check=True,
+        stderr=subprocess.PIPE,
+        stdout=subprocess.PIPE)
+    logging.info('Processing registry.')
+    for package_toml_path in glob.glob(
+        os.path.join(registry_path, '**/Package.toml'), recursive=True):
+      with open(package_toml_path) as package_toml_file:
+        package_description = toml.load(package_toml_file)
+        package_name = package_description['name']
+        package_repo = package_description['repo']
+        if 'jll' not in package_name:
+          package_list.append({'name': package_name, 'repo': package_repo})
+          # Omit the last four characters, as Julia appends .git by default
+          # to all of its repository URLs, which we don't want.
+          repository_url_list.append(package_repo[:-4])
+      if len(package_list) >= FLAGS.max_projects:
+        break
+
+  logging.info('Gathering license information from the Github API.')
+  repo_license_map = licenses.get_repository_licenses(repository_url_list,
+                                                      FLAGS.gh_pat)
+  for package_dict in package_list:
+    package_dict['license'] = repo_license_map[package_dict['repo'][:-4]]
+
+  if FLAGS.github_ld:
+    logging.info('Gathering license information through license detection')
+    ray.init()
+
+    repo_license_futures = []
+    repo_name_license_map = {}
+
+    for package_dict in package_list:
+      if package_dict['license'] == 'NOASSERTION':
+        repo_license_futures.append(
+            get_detected_license_repo_future.remote(package_dict['repo'],
+                                                    package_dict['name']))
+
+    while len(repo_license_futures) > 0:
+      finished, repo_license_futures = ray.wait(
+          repo_license_futures, timeout=5.0)
+      logging.info(f'Just got license information on {len(finished)} repos, '
+                   f'{len(repo_license_futures)} remaining.')
+      repo_names_licenses = ray.get(finished)
+      for repo_name, repo_license in repo_names_licenses:
+        repo_name_license_map[repo_name] = repo_license
+
+    for package_dict in package_list:
+      if package_dict['name'] in repo_name_license_map:
+        package_dict['license'] = repo_name_license_map[package_dict['name']]
+
+  logging.info('Writing packages to list.')
+  with open(FLAGS.package_list, 'w') as package_list_file:
+    json.dump(package_list, package_list_file, indent=2)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_spack_package_list.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_spack_package_list.py
new file mode 100644
index 000000000000000..f80e40b0cf6afdd
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_spack_package_list.py
@@ -0,0 +1,152 @@
+"""Tool for getting all spack packages that are usable for producing LLVM
+bitcode.
+
+Note: This must be run with `spack-python` or `spack python` rather than your
+default python interpreter.
+"""
+
+import json
+import multiprocessing
+import tempfile
+import os
+import subprocess
+import logging
+import sys
+import re
+
+from absl import app
+from absl import flags
+
+import ray
+
+import spack.repo
+import spack.environment
+import spack.spec
+import spack.config
+
+from llvm_ir_dataset_utils.util import spack as spack_utils
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', 'package_list.json',
+                    'The path to write the package list to.')
+flags.DEFINE_string(
+    'error_log', None,
+    'The path to write the output of failed concretization commands to.')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The max number of projects to process.')
+
+
+def add_concrete_package_and_all_deps(concretized_packages, spec):
+  spec_string = str(spec)
+  license_string = re.findall('license=".*?"', spec_string)[0][9:-1]
+  concretized_packages[spec.dag_hash()] = {
+      'spec': spec_string,
+      'deps': [dep_spec.dag_hash() for dep_spec in spec.dependencies()],
+      'name': str(spec.package.fullname.split('.')[1]),
+      'license': license_string
+  }
+  for dep_spec in spec.dependencies():
+    if dep_spec.dag_hash() not in concretized_packages:
+      add_concrete_package_and_all_deps(concretized_packages, dep_spec)
+
+
+ at ray.remote(num_cpus=1)
+def concretize_environment(package_name):
+  concretized_packages = {}
+  with tempfile.TemporaryDirectory() as tempdir:
+    env = spack.environment.create_in_dir(tempdir)
+    env.add(spack.spec.Spec(package_name))
+    env.unify = False
+    env.write()
+
+    os.mkdir(os.path.join(tempdir, '.spack'))
+    command_env = os.environ.copy()
+    command_env['HOME'] = tempdir
+    spack_utils.spack_setup_compiler(tempdir)
+
+    concretize_command_vector = ['spack', '-e', './', 'concretize']
+
+    command_output = subprocess.run(
+        concretize_command_vector,
+        cwd=tempdir,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        env=command_env,
+        universal_newlines=True)
+
+    if command_output.returncode == 0:
+      env = spack.environment.Environment(tempdir)
+
+      concretized_specs = env.all_specs()
+      for concretized_spec in concretized_specs:
+        add_concrete_package_and_all_deps(concretized_packages,
+                                          concretized_spec)
+      return (command_output.stdout, concretized_packages, package_name)
+    else:
+      return (command_output.stdout, None, package_name)
+
+
+def get_concretization_future(package_name):
+  return concretize_environment.remote(package_name)
+
+
+def main(_):
+  ray.init()
+  logging.info('Getting packages.')
+  packages = spack.repo.all_package_names(include_virtuals=True)
+
+  full_package_list = []
+
+  for package in packages:
+    pkg_class = spack.repo.PATH.get_pkg_class(package)
+    # TODO(boomanaiden154): Look into other build systems that are likely to be
+    # composed of c/c++ projects.
+    pkg = pkg_class(spack.spec.Spec(package))
+    if (pkg.build_system_class == 'CMakePackage' or
+        pkg.build_system_class == 'MakefilePackage' or
+        pkg.build_system_class == 'AutotoolsPackage' or
+        pkg.build_system_class == 'MesonPackage'):
+      full_package_list.append(pkg.name)
+
+    if len(full_package_list) >= FLAGS.max_projects:
+      break
+
+  logging.info('Concretizing packages')
+  concretization_futures = []
+  for package in full_package_list:
+    concretization_futures.append(get_concretization_future(package))
+
+  concretized_packages = {}
+
+  error_log_file = None
+
+  if FLAGS.error_log is not None:
+    error_log_file = open(FLAGS.error_log, 'w')
+
+  while len(concretization_futures) > 0:
+    finished, concretization_futures = ray.wait(
+        concretization_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    for data in finished_data:
+      if data[1] is None:
+        if error_log_file is not None:
+          error_log_file.write(
+              f'Encountered the following errors while concretizing {data[2]}:\n'
+          )
+          error_log_file.write(data[0])
+      else:
+        concretized_packages.update(data[1])
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(concretization_futures)} remaining'
+    )
+
+  if error_log_file is not None:
+    error_log_file.close()
+
+  with open(FLAGS.package_list, 'w') as package_list_file:
+    json.dump(concretized_packages, package_list_file, indent=2)
+
+
+if __name__ == '__main__':
+  app.run(main)
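The license extraction in add_concrete_package_and_all_deps relies on the concretized spec string carrying a license="..." attribute. An illustrative example with a made-up spec string:

    import re

    spec_string = 'zlib@1.3.1 ... license="Zlib" arch=linux-ubuntu22.04-x86_64'
    license_string = re.findall('license=".*?"', spec_string)[0][9:-1]
    # license_string == 'Zlib'; 'license="' is nine characters long and the
    # trailing quote is dropped by the [9:-1] slice.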
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_swift_packages.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_swift_packages.py
new file mode 100644
index 000000000000000..36ac322315b362f
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/get_swift_packages.py
@@ -0,0 +1,116 @@
+"""Tool for getting Swift package list."""
+
+import subprocess
+import tempfile
+import logging
+import json
+import os
+import sys
+
+from llvm_ir_dataset_utils.util import licenses
+
+from absl import app
+from absl import flags
+
+import ray
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('package_list', 'swift_package_list.txt',
+                    'The path to write the list of swift packages to.')
+flags.DEFINE_string(
+    'gh_pat', None,
+    'Your Github personal access token. Needed to query license information.')
+flags.DEFINE_boolean(
+    'github_ld', False,
+    'Whether or not to download the repositories that have not already been '
+    'tagged with license information and use go-license-detector to detect '
+    'license information')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The maximum number of projects to process.')
+
+flags.mark_flag_as_required('gh_pat')
+
+REGISTRY_REPOSITORY = 'https://github.com/SwiftPackageIndex/PackageList'
+
+
+# TODO(boomanaiden154): This and some of the code below can be refactored
+# out into some common utilities as quite a bit is duplicated with
+# get_julia_packages.py
+ at ray.remote(num_cpus=1)
+def get_detected_license_repo_future(repo_url, repo_name):
+  return (repo_name,
+          licenses.get_detected_license_from_repo(repo_url, repo_name))
+
+
+def main(_):
+  package_list = []
+  with tempfile.TemporaryDirectory() as download_dir:
+    registry_path = os.path.join(download_dir, 'registry')
+    registry_clone_vector = [
+        'git', 'clone', REGISTRY_REPOSITORY, '--depth=1', registry_path
+    ]
+    logging.info('Cloning registry repository.')
+    subprocess.run(
+        registry_clone_vector,
+        check=True,
+        stderr=subprocess.PIPE,
+        stdout=subprocess.PIPE)
+    logging.info('Processing registry.')
+    package_list_json_path = os.path.join(registry_path, 'packages.json')
+    with open(package_list_json_path) as package_list_json_file:
+      package_list = json.load(package_list_json_file)
+
+  package_list = package_list[:FLAGS.max_projects]
+
+  logging.info('Collecting license information from the Github API.')
+  sanitized_package_list = []
+  for package in package_list:
+    # We don't want the .git that is automatically at the end
+    sanitized_package_list.append(package[:-4])
+  repository_license_map = licenses.get_repository_licenses(
+      sanitized_package_list, FLAGS.gh_pat)
+
+  logging.info('Writing packages to list.')
+  output_package_list = []
+  for package in package_list:
+    current_package = {
+        'repo': package,
+        'name': package.split('/')[-1][:-4],
+        'license': repository_license_map[package[:-4]]
+    }
+    output_package_list.append(current_package)
+
+  if FLAGS.github_ld:
+    logging.info('Gathering license information through license detection.')
+    ray.init()
+
+    repo_license_futures = []
+    repo_name_license_map = {}
+
+    for package_dict in output_package_list:
+      if package_dict['license'] == 'NOASSERTION':
+        repo_license_futures.append(
+            get_detected_license_repo_future.remote(package_dict['repo'],
+                                                    package_dict['name']))
+
+    while len(repo_license_futures) > 0:
+      finished, repo_license_futures = ray.wait(
+          repo_license_futures, timeout=5.0)
+      logging.info(
+          f'Just got license information on {len(finished)} repos, {len(repo_license_futures)} remaining.'
+      )
+      repo_names_licenses = ray.get(finished)
+      for repo_name, repo_license in repo_names_licenses:
+        repo_name_license_map[repo_name] = repo_license
+
+    for package_dict in output_package_list:
+      if package_dict['name'] in repo_name_license_map:
+        package_dict['license'] = repo_name_license_map[package_dict['name']]
+
+  with open(FLAGS.package_list, 'w') as package_list_file:
+    json.dump(output_package_list, package_list_file, indent=2)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/link_files.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/link_files.py
new file mode 100644
index 000000000000000..ef67a0c8fbf264f
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/link_files.py
@@ -0,0 +1,89 @@
+"""Tool for running llvm-link over all bitcode files in a corpus."""
+
+import pathlib
+import os
+import subprocess
+import logging
+
+from absl import app
+from absl import flags
+
+import ray
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The path to the corpus directory.')
+flags.DEFINE_string('output_dir', None, 'The path to the output directory.')
+
+flags.mark_flag_as_required('corpus_dir')
+
+
+ at ray.remote(num_cpus=1)
+def link_package(folder_path, output_dir):
+  # TODO(boomanaiden154): Pull from a corpus_manifest/meta corpus manifest
+  # rather than glob for the bitcode files once they're available in all of
+  # my builds.
+  bitcode_files_gen = pathlib.Path(folder_path).glob('**/*.bc')
+  bitcode_files = list(bitcode_files_gen)
+
+  if len(bitcode_files) == 0:
+    return (False, None)
+
+  command_vector = ['llvm-link']
+
+  command_vector.append(bitcode_files[0])
+  for bitcode_file in bitcode_files[1:]:
+    command_vector.extend(['-override', bitcode_file])
+
+  package_name = os.path.basename(folder_path)
+  output_file_path = os.path.join(output_dir, package_name + '.bc')
+  command_vector.extend(['-o', output_file_path])
+
+  try:
+    command_output = subprocess.run(
+        command_vector, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+  except OSError:
+    return (False, None)
+
+  if command_output.returncode == 0:
+    return (True, output_file_path)
+  else:
+    return (False, output_file_path)
+
+
+def main(_):
+  pathlib.Path(FLAGS.output_dir).mkdir(exist_ok=True, parents=True)
+
+  corpus_folders = os.listdir(FLAGS.corpus_dir)
+
+  package_processing_futures = []
+  for corpus_folder in corpus_folders:
+    corpus_folder_full_path = os.path.join(FLAGS.corpus_dir, corpus_folder)
+    package_processing_future = link_package.remote(corpus_folder_full_path,
+                                                    FLAGS.output_dir)
+    package_processing_futures.append(package_processing_future)
+
+  link_success = 0
+  link_failures = []
+  while len(package_processing_futures) > 0:
+    to_wait_for = 128
+    if len(package_processing_futures) < 256:
+      to_wait_for = 1
+    finished, package_processing_futures = ray.wait(
+        package_processing_futures, timeout=5.0, num_returns=to_wait_for)
+    finished_data = ray.get(finished)
+    for finished_link in finished_data:
+      if finished_link[0]:
+        link_success += 1
+      else:
+        link_failures.append(finished_link[1])
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(package_processing_futures)} remaining.'
+    )
+
+  logging.info(
+      f'Got {link_success} successes and {len(link_failures)} failures.')
+
+
+if __name__ == '__main__':
+  app.run(main)
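For a hypothetical package folder foo/ containing a.bc and b.bc, the command vector assembled in link_package above comes out roughly as (paths shortened for readability; the real entries are pathlib.Path objects):

    ['llvm-link', 'a.bc', '-override', 'b.bc', '-o', '/output/foo.bc']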
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/module_statistics.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/module_statistics.py
new file mode 100644
index 000000000000000..001eb3b86fa087c
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/module_statistics.py
@@ -0,0 +1,174 @@
+"""Tool for getting statistics on bitcode modules."""
+
+import os
+import logging
+import csv
+import sys
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.util import bitcode_module
+from llvm_ir_dataset_utils.util import dataset_corpus
+from llvm_ir_dataset_utils.util import parallel
+
+MODULE_STATISTICS_TYPES = [
+    'parsing', 'module_size', 'module_size_text', 'get_lowered_size',
+    'get_opt_lowered_size', 'call_names', 'function_hashes',
+    'module_properties', 'module_hashes', 'module_instruction_distribution',
+    'defined_function_names', 'token_count', 'post_O3_function_hashes',
+    'module_instruction_distribution_O3', 'module_properties_O3'
+]
+
+FUNCTION_STATISTICS_TYPES = [
+    'properties', 'passes', 'post_opt_properties', 'instruction_distribution'
+]
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None,
+                    'The corpus directory to look for modules in.')
+flags.DEFINE_string('output_file_path', None, 'The output file.')
+flags.DEFINE_enum('type', 'properties',
+                  MODULE_STATISTICS_TYPES + FUNCTION_STATISTICS_TYPES,
+                  'The type of statistics to collect.')
+flags.DEFINE_integer(
+    'max_projects',
+    sys.maxsize,
+    'The maximum number of projects to process.',
+    lower_bound=1)
+flags.DEFINE_string('error_file_path', None, 'The path to log errors in.')
+flags.DEFINE_enum(
+    'language_filter', 'none', ['c', 'cpp', 'none'], 'Specify a '
+    'language to filter for. This is mostly aimed at filtering '
+    'for c/c++ which can coexist in the same project.')
+flags.DEFINE_string(
+    'vocab_path', None, 'The path to the vocab '
+    'file for doing BPE tokenization. Only used for the '
+    'token_count module statistics.')
+flags.DEFINE_string(
+    'project_filter', None,
+    'A filter for projects. If the filter string is present in the project '
+    'name, it is included in the statistics.')
+
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('output_file_path')
+
+BITCODE_MODULE_CHUNK_SIZE = 32
+
+
+ at ray.remote(num_cpus=1)
+def get_statistics_module_functions(project_dir, bitcode_file_path,
+                                    statistics_type):
+  bitcode_file = dataset_corpus.load_file_from_corpus(project_dir,
+                                                      bitcode_file_path)
+  module_path = f'{project_dir}:{bitcode_file_path}'
+  return bitcode_module.get_bitcode_module_function_statistics(
+      bitcode_file, statistics_type, module_path)
+
+
+ at ray.remote(num_cpus=1)
+def process_single_project(project_dir, statistics_type, language_filter,
+                           extra_properties):
+  statistics = []
+  try:
+    bitcode_modules = dataset_corpus.get_bitcode_file_paths(project_dir)
+  except:
+    return []
+
+  module_futures = []
+  if statistics_type in MODULE_STATISTICS_TYPES:
+    # We're computing a module level statistic. Split modules into batches
+    # and then compute statistics over them.
+    batches = parallel.split_batches(bitcode_modules, BITCODE_MODULE_CHUNK_SIZE)
+    for batch in batches:
+      module_futures.append(
+          bitcode_module.get_module_statistics_batch.remote(
+              project_dir, batch, statistics_type, language_filter,
+              extra_properties))
+  else:
+    for bitcode_file_path in bitcode_modules:
+      module_futures.append(
+          get_statistics_module_functions.remote(project_dir, bitcode_file_path,
+                                                 statistics_type))
+
+  module_statistics = ray.get(module_futures)
+  for module_statistic in module_statistics:
+    statistics.extend(module_statistic)
+  return statistics
+
+
+def collect_statistics(projects_list, statistics_type):
+  project_futures = []
+
+  for project_dir in projects_list:
+    if FLAGS.project_filter:
+      if FLAGS.project_filter not in project_dir:
+        continue
+    full_project_path = os.path.join(FLAGS.corpus_dir, project_dir)
+    extra_properties = {'bpe_vocab_path': FLAGS.vocab_path}
+    project_futures.append(
+        process_single_project.remote(full_project_path, statistics_type,
+                                      FLAGS.language_filter, extra_properties))
+    if len(project_futures) >= FLAGS.max_projects:
+      break
+
+  statistics = []
+
+  while len(project_futures) > 0:
+    to_return = 128 if len(project_futures) > 256 else 1
+    finished, project_futures = ray.wait(
+        project_futures, timeout=5.0, num_returns=to_return)
+    logging.info(
+        f'Just finished {len(finished)}, {len(project_futures)} remaining.')
+    for project_statistics in ray.get(finished):
+      statistics.extend(project_statistics)
+
+  combined_statistics = {}
+  errors = []
+  for statistic in statistics:
+    if statistic[0]:
+      errors.append(statistic)
+    else:
+      individual_data = statistic[1]
+      data_length = 0
+      if len(individual_data) != 0:
+        data_length = len(next(iter(individual_data.values())))
+      individual_data['name'] = [statistic[2]] * data_length
+      if 'instruction_distribution' in statistics_type or 'properties' in statistics_type:
+        fill_value = 0
+      else:
+        fill_value = False
+      combined_statistics = bitcode_module.combine_statistics(
+          combined_statistics, individual_data, fill_value)
+
+  if FLAGS.error_file_path:
+    with open(FLAGS.error_file_path, 'w') as error_file:
+      for error in errors:
+        error_file.write(f'{error[2]},{error[0]}\n')
+
+  logging.info('Writing statistics to csv file.')
+
+  with open(FLAGS.output_file_path, 'w') as output_file:
+    csv_writer = csv.writer(output_file)
+    csv_writer.writerow(combined_statistics.keys())
+    csv_writer.writerows(zip(*combined_statistics.values()))
+
+
+def main(_):
+  ray.init()
+
+  # Perform some basic input validation
+  if FLAGS.type == 'token_count' and FLAGS.vocab_path is None:
+    logging.fatal('A vocab path must be specified when gathering token counts.')
+    sys.exit(1)
+
+  projects = os.listdir(FLAGS.corpus_dir)
+
+  collect_statistics(projects, FLAGS.type)
+
+
+if __name__ == '__main__':
+  app.run(main)
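The CSV writing at the end of collect_statistics treats combined_statistics as a dict of equal-length columns, so writerow(keys) followed by writerows(zip(*values)) emits one row per module. Illustrative values:

    combined_statistics = {'name': ['pkg-a:./a.bc', 'pkg-b:./b.bc'],
                           'token_count': [1024, 2048]}
    # Written out as:
    #   name,token_count
    #   pkg-a:./a.bc,1024
    #   pkg-b:./b.bc,2048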
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/parse_crates_database.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/parse_crates_database.py
new file mode 100644
index 000000000000000..b851559e0feb67c
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/parse_crates_database.py
@@ -0,0 +1,125 @@
+"""A tool for downloading and parsing the crates.io database to get repositories
+and corpus descriptions out.
+"""
+
+import csv
+import tempfile
+import os
+import tarfile
+import sys
+import json
+import requests
+from urllib import parse
+
+from absl import app
+from absl import flags
+import logging
+
+csv.field_size_limit(sys.maxsize)
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('repository_list', 'repository_list.json',
+                    'The path to write the repository list to.')
+flags.DEFINE_string(
+    'db_dump_archive', None,
+    'The path to the database dump. Only pass a value to this flag if you '
+    'don\'t want the script to download the dump itself.')
+
+
+def process_git_url(git_repo_url):
+  url_struct = parse.urlparse(git_repo_url)
+  if url_struct.netloc == 'github.com':
+    # Keep only the first three components of the path ('', owner, repo).
+    truncated_path = '/'.join(url_struct.path.split('/')[:3])
+    return parse.urlunparse(url_struct._replace(path=truncated_path))
+  else:
+    return parse.urlunparse(url_struct)
+
+
+def deduplicate_repositories(crates_list):
+  repository_dict = {}
+  new_crates_list = []
+  # We're making the assumption here that if multiple crates point to the
+  # same repository, all of them can be built from that repository.
+  # TODO(boomanaiden154): Investigate further whether or not this assumption
+  # makes sense.
+  for crate in crates_list:
+    if crate['repository'] is None:
+      new_crates_list.append(crate)
+    elif crate['repository'] not in repository_dict:
+      repository_dict[crate['repository']] = True
+      new_crates_list.append(crate)
+  return new_crates_list
+
+
+def canonicalize_license(license_string):
+  # Some of the licenses include / as a separator. This is equivalent to OR
+  # within the rust crates index, but not standard in the SPDX format.
+  license_string = license_string.replace('/', ' OR ')
+  return license_string
+
+
+def main(_):
+  with tempfile.TemporaryDirectory() as download_dir:
+    file_download_path = FLAGS.db_dump_archive
+    if file_download_path is None:
+      logging.info('Downloading crates.io database dump.')
+      file_download_path = os.path.join(download_dir, 'db-dump.tar.gz')
+      response = requests.get('https://static.crates.io/db-dump.tar.gz')
+      with open(file_download_path, 'wb') as file_download_file:
+        file_download_file.write(response.content)
+      logging.info('Extracting relevant data from the downloaded tar archive.')
+    else:
+      logging.info('Not downloading crates.io database dump, using user '
+                   'archive.')
+    logging.info('Extracting relevant files from archive.')
+    with tarfile.open(file_download_path) as crates_tar_archive:
+      files_to_extract = {}
+      for crates_file_name in crates_tar_archive.getnames():
+        if 'crates.csv' in crates_file_name:
+          files_to_extract['crates.csv'] = crates_file_name
+        elif 'versions.csv' in crates_file_name:
+          files_to_extract['versions.csv'] = crates_file_name
+      for file_to_extract in files_to_extract:
+        crates_tar_archive.extract(files_to_extract[file_to_extract],
+                                   download_dir)
+      logging.info('Parsing crates list.')
+      with open(os.path.join(download_dir,
+                             files_to_extract['crates.csv'])) as crates_file:
+        reader = csv.DictReader(crates_file)
+        crates_list = [row for row in reader]
+      logging.info('Parsing versions list.')
+      with open(os.path.join(
+          download_dir, files_to_extract['versions.csv'])) as versions_file:
+        reader = csv.DictReader(versions_file)
+        versions_map = {}
+        for version_entry in reader:
+          if version_entry['crate_id'] not in versions_map or versions_map[
+              version_entry['crate_id']][0] < version_entry['num']:
+            versions_map[version_entry['crate_id']] = (
+                version_entry['num'],
+                canonicalize_license(version_entry['license']))
+  logging.info('Generating and deduplicating repository list.')
+  source_list = []
+  for crate in crates_list:
+    crate_source_dict = {
+        'repository':
+            crate['repository'] if crate["repository"] != '' else None,
+    }
+    if crate['id'] in versions_map:
+      crate_version = versions_map[crate['id']][0]
+      crate_source_dict[
+          'tar_archive'] = f'https://crates.io/api/v1/crates/{crate["name"]}/{crate_version}/download'
+      crate_source_dict['license'] = versions_map[crate['id']][1]
+    else:
+      crate_source_dict['tar_archive'] = None
+    source_list.append(crate_source_dict)
+  source_list = deduplicate_repositories(source_list)
+  logging.info(f'Writing {len(source_list)} crate sources.')
+  with open(FLAGS.repository_list, 'w') as repository_list_file:
+    json.dump(source_list, repository_list_file, indent=2)
+
+
+if __name__ == "__main__":
+  app.run(main)
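process_git_url trims GitHub URLs down to the owner/repo prefix so that crates pointing at subdirectories of a single repository deduplicate together. An illustrative call (the URL is chosen only as an example):

    process_git_url('https://github.com/serde-rs/serde/tree/master/serde_derive')
    # -> 'https://github.com/serde-rs/serde'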
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/process_to_parquet.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/process_to_parquet.py
new file mode 100644
index 000000000000000..57ec44a915f588f
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/process_to_parquet.py
@@ -0,0 +1,67 @@
+"""This is a script that allows for the conversion of a deduplicated dataset
+into a parquet dataset for distribution.
+"""
+
+import logging
+import os
+import sys
+
+from absl import app
+from absl import flags
+
+import pandas
+
+import pyarrow
+
+from pyarrow import parquet
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The corpus to pull bitcode from.')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The maximum number of projects to process')
+
+flags.mark_flag_as_required('corpus_dir')
+
+# TODO(boomanaiden154): Add in support for propagating license information
+# and other project provenance information once we have it.
+
+
+def process_single_project(project_dir, dataset_dir):
+  try:
+    bitcode_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
+  except:
+    return
+
+  module_content = []
+
+  for bitcode_path in bitcode_paths:
+    bitcode_file_data = dataset_corpus.load_file_from_corpus(
+        project_dir, bitcode_path)
+    module_content.append(bitcode_file_data)
+
+  dataframe = pandas.DataFrame.from_dict({'content': module_content})
+
+  table = pyarrow.Table.from_pandas(dataframe, preserve_index=False)
+
+  parquet.write_table(table, dataset_dir)
+
+
+def main(_):
+  projects_list = os.listdir(FLAGS.corpus_dir)
+
+  logging.info(f'Processing {len(projects_list)} projects')
+
+  for index, project_dir in enumerate(projects_list):
+    project_path = os.path.join(FLAGS.corpus_dir, project_dir)
+    process_single_project(project_path, '/tmp/test.parquet')
+    logging.info(f'Just finished processing {project_dir}')
+
+    if index >= FLAGS.max_projects:
+      break
+
+
+if __name__ == '__main__':
+  app.run(main)
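A written Parquet file can be inspected back with pyarrow; a quick sketch assuming the hard-coded /tmp/test.parquet path used above:

    from pyarrow import parquet

    table = parquet.read_table('/tmp/test.parquet')
    first_module = table.column('content')[0].as_py()  # raw bitcode bytes
    print(len(first_module))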
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/search_strings.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/search_strings.py
new file mode 100644
index 000000000000000..1567472796384d2
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/search_strings.py
@@ -0,0 +1,74 @@
+"""Search for strings in bc files that will be in the dataset distribution.
+"""
+
+import logging
+import sys
+import os
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('corpus_dir', None, 'The corpus to scan for strings')
+flags.DEFINE_multi_string('strings', None,
+                          'The strings to look for in the corpus')
+flags.DEFINE_integer('max_projects', sys.maxsize,
+                     'The maximum number of projects to process.')
+
+flags.mark_flag_as_required('corpus_dir')
+flags.mark_flag_as_required('strings')
+
+
+ at ray.remote
+def does_project_contain_strings(project_dir, strings):
+  try:
+    bitcode_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
+  except:
+    return False
+
+  for bitcode_path in bitcode_paths:
+    bitcode_file_data = dataset_corpus.load_file_from_corpus(
+        project_dir, bitcode_path)
+    for possible_string in strings:
+      if bitcode_file_data.find(possible_string.encode('utf-8')) != -1:
+        return True
+  return False
+
+
+def main(_):
+  ray.init()
+
+  projects = os.listdir(FLAGS.corpus_dir)[:FLAGS.max_projects]
+  project_futures = []
+  for project_dir in projects:
+    full_project_dir = os.path.join(FLAGS.corpus_dir, project_dir)
+    project_futures.append(
+        does_project_contain_strings.remote(full_project_dir, FLAGS.strings))
+
+  has_strings = 0
+  no_strings = 0
+
+  while len(project_futures) > 0:
+    num_to_return = 1024 if len(project_futures) > 2048 else 1
+    finished_projects, project_futures = ray.wait(
+        project_futures, timeout=5.0, num_returns=num_to_return)
+    logging.info(
+        f'Just finished processing {len(finished_projects)} projects, {len(project_futures)} projects remaining.'
+    )
+    finished_data = ray.get(finished_projects)
+    for project_status in finished_data:
+      if project_status:
+        has_strings += 1
+      else:
+        no_strings += 1
+
+  logging.info(f'{has_strings} projects contain the specified strings.')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/spack_analyze_failures.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/spack_analyze_failures.py
new file mode 100644
index 000000000000000..e8fb82a69feebb3
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/spack_analyze_failures.py
@@ -0,0 +1,87 @@
+"""A tool for finding spack build failures that break the most dependent
+packages.
+"""
+
+import json
+import csv
+import os
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'build_failures', None,
+    'The path to the CSV file of build failures from get_build_failures.py')
+flags.DEFINE_string(
+    'package_list', None,
+    'The path to the package list jSON from get_spack_package_list.py')
+
+flags.mark_flag_as_required('build_failures')
+flags.mark_flag_as_required('package_list')
+
+
+def get_dependents_dict(package_dependencies_dict):
+  dependents_dict = {}
+  for package in package_dependencies_dict:
+    for package_dependency in package_dependencies_dict[package]['deps']:
+      if package_dependency in dependents_dict:
+        dependents_dict[package_dependency].append(package)
+      else:
+        dependents_dict[package_dependency] = [package]
+  return dependents_dict
+
+
+def get_dependents(package_hash, dependents_dict):
+  dependents = []
+  if package_hash not in dependents_dict:
+    return []
+  else:
+    dependents.extend(dependents_dict[package_hash])
+  for dependent_package_hash in dependents_dict[package_hash]:
+    dependents.extend(get_dependents(dependent_package_hash, dependents_dict))
+  return dependents
+
+
+def deduplicate_list(to_deduplicate):
+  return list(dict.fromkeys(to_deduplicate))
+
+
+def main(_):
+  with open(FLAGS.package_list) as package_list_file:
+    package_dict = json.load(package_list_file)
+
+  package_hash_failures = []
+  with open(FLAGS.build_failures) as build_failures_file:
+    build_failures_reader = csv.reader(build_failures_file)
+    for failure_row in build_failures_reader:
+      # Exclude failures that happen because a dependency fails to build.
+      if failure_row[2] != 'NULL':
+        package_name_hash = os.path.dirname(failure_row[2])
+        # Cut off the last six characters to get rid of the .tar: at the
+        # end of every line in an archived corpus.
+        # TODO(boomanaiden154): Make this robust against usage in a non-archived
+        # corpus.
+        package_hash = package_name_hash.split('-')[1][:-6]
+        package_hash_failures.append(package_hash)
+
+  dependents_dict = get_dependents_dict(package_dict)
+
+  failures_dependents = []
+  for failure_hash in package_hash_failures:
+    # Deduplicate the list of dependents because we're not checking some
+    # conditions while walking the dependents tree and this is a "cheap" way to
+    # fix that.
+    failures_dependents.append(
+        (failure_hash,
+         len(deduplicate_list(get_dependents(failure_hash, dependents_dict)))))
+
+  failures_dependents.sort(key=lambda a: a[1])
+
+  for failure_dependents_pair in failures_dependents:
+    print(f'{failure_dependents_pair[0]},{failure_dependents_pair[1]}')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/top_x_constants.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/top_x_constants.py
new file mode 100644
index 000000000000000..9021ed8e25b26c6
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/tools/top_x_constants.py
@@ -0,0 +1,37 @@
+"""Tool for getting the top x constants from a constant frequency histogram."""
+
+import logging
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('constant_histogram_file', None,
+                    'The path to the constant histogram CSV file.')
+flags.DEFINE_string('output_file', None, 'The path to the output file.')
+flags.DEFINE_integer('constant_count', None,
+                     'The number of constants to pull from the histogram.')
+
+flags.mark_flag_as_required('constant_histogram_file')
+flags.mark_flag_as_required('output_file')
+flags.mark_flag_as_required('constant_count')
+
+
+def main(_):
+  constants = []
+  with open(FLAGS.constant_histogram_file) as constant_histogram_file:
+    for line in constant_histogram_file:
+      line_stripped = line.rstrip()
+      line_parts = line_stripped.split(',')
+      constants.append((int(line_parts[0]), int(line_parts[1])))
+
+  constants.sort(key=lambda const: const[1], reverse=True)
+
+  with open(FLAGS.output_file, 'w') as output_file:
+    for constant in constants[0:FLAGS.constant_count]:
+      output_file.write(f'{constant[0]}\n')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/bitcode_module.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/bitcode_module.py
new file mode 100644
index 000000000000000..ea9959209a0f468
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/bitcode_module.py
@@ -0,0 +1,705 @@
+"""Utilities for working with bitcode modules."""
+
+import subprocess
+import os
+import tempfile
+import logging
+import json
+import shutil
+
+import ray
+
+from llvm_ir_dataset_utils.util import dataset_corpus
+from llvm_ir_dataset_utils.util import pass_list_constants
+from llvm_ir_dataset_utils.util import parallel
+
+BITCODE_FILE_CHUNK_SIZE = 16
+
+OPT_TIMEOUT_SECONDS = 60
+FASTBPE_TIMEOUT_SECONDS = 180
+LLVM_DIS_TIMEOUT_SECONDS = 180
+
+
+def get_function_symbols(bitcode_module):
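+  # Use llvm-nm to list the symbols defined in the bitcode module, keeping only
+  # text (code) symbols. Returns a tuple of (error, symbol_list) where exactly
+  # one element is None.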
+  llvm_nm_command_vector = ['llvm-nm', '--defined-only', '--format=posix', '-']
+  with subprocess.Popen(
+      llvm_nm_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as llvm_nm_process:
+    stdout = llvm_nm_process.communicate(
+        input=bitcode_module)[0].decode('utf-8')
+    if llvm_nm_process.returncode != 0:
+      logging.warning('Failed to get functions from bitcode module.')
+      return (stdout.replace('\n', ''), None)
+    module_symbols = stdout.split('\n')[:-1]
+  module_list = []
+  for symbol in module_symbols:
+    symbol_parts = symbol.split(' ')
+    # Only look for t or T symbols (actual code)
+    if symbol_parts[1] == 't' or symbol_parts[1] == 'T':
+      module_list.append(symbol_parts[0])
+  return (None, module_list)
+
+
+def extract_individual_function(bitcode_module, extraction_path,
+                                function_symbol):
+  function_module_name = os.path.join(extraction_path, f'{function_symbol}.bc')
+  extract_command_vector = [
+      'llvm-extract', '-func', function_symbol, '-o', function_module_name
+  ]
+  try:
+    with subprocess.Popen(
+        extract_command_vector,
+        stderr=subprocess.STDOUT,
+        stdout=subprocess.PIPE,
+        stdin=subprocess.PIPE) as extraction_process:
+      stdout = extraction_process.communicate(
+          input=bitcode_module)[0].decode('utf-8')
+      if extraction_process.returncode != 0:
+        logging.info(f'Failed to extract {function_symbol}')
+        return (stdout.replace('\n', ''), None)
+  except OSError:
+    logging.info(f'Failed to extract {function_symbol} due to OSError')
+    return ('oserror', None)
+
+  return (None, function_module_name)
+
+
+def get_run_passes_opt(bitcode_function_path):
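+  # Run the default O3 pipeline with -print-changed and parse its output to
+  # record, per pass invocation, whether the pass modified the IR.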
+  opt_command_vector = [
+      'opt', bitcode_function_path, '-print-changed', '-passes=default<O3>',
+      '-o', '/dev/null'
+  ]
+  try:
+    opt_process = subprocess.run(
+        opt_command_vector,
+        encoding='UTF-8',
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        timeout=OPT_TIMEOUT_SECONDS)
+  except subprocess.SubprocessError:
+    return ('timeout', None)
+  if opt_process.returncode != 0:
+    return (opt_process.stdout.replace('\n', ''), None)
+  opt_process_lines = opt_process.stdout.split('\n')
+  pass_indexes = {}
+  passes = {}
+  for opt_process_line in opt_process_lines:
+    if opt_process_line[:3] == '***' and opt_process_line[-3:] == '***':
+      # We're in a pass status line
+      if opt_process_line[4:11] == 'IR Pass':
+        # Anything starting with IR Pass gets ignored, so we can't do anything
+        # with it.
+        continue
+      if opt_process_line[12:20] == 'At Start':
+        # Ignore the starting IR
+        continue
+      pass_name = opt_process_line.split(' on ')[0][12:]
+      pass_name = pass_name.split('After ')[1]
+      # -print-changed reports 'no change' when the pass left the IR intact.
+      ir_unchanged = opt_process_line[-13:-4] == 'no change'
+      # Special case loop passes because they run once per loop rather than
+      # once per function.
+      if pass_name in pass_list_constants.LOOP_PASS_LIST:
+        pass_name = pass_name + '1'
+        if pass_name not in passes or not passes[pass_name]:
+          passes[pass_name] = ir_unchanged
+      elif pass_name in pass_indexes:
+        pass_indexes[pass_name] += 1
+        pass_name = f'{pass_name}{pass_indexes[pass_name]}'
+      else:
+        pass_indexes[pass_name] = 1
+        pass_name = pass_name + '1'
+      if ir_unchanged:
+        passes[pass_name] = [False]
+      else:
+        passes[pass_name] = [True]
+  return (None, passes)
+
+
+def combine_statistics(function_a, function_b, fill_value=False):
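+  # Merge two per-function statistics dictionaries, padding statistics that
+  # only appear on one side with fill_value so that every list keeps the same
+  # length.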
+  if function_a is None or function_a == {}:
+    return function_b
+  combined_statistics = function_a
+  combined_statistics_length = len(combined_statistics[list(
+      combined_statistics.keys())[0]])
+  for function_statistic in list(
+      set(list(function_a.keys()) + list(function_b.keys()))):
+    if function_statistic in combined_statistics and function_statistic in function_b:
+      combined_statistics[function_statistic].extend(
+          function_b[function_statistic])
+    elif function_statistic in function_b:
+      combined_statistics[function_statistic] = [
+          fill_value for i in range(0, combined_statistics_length)
+      ]
+      combined_statistics[function_statistic].extend(
+          function_b[function_statistic])
+    elif function_statistic in combined_statistics:
+      function_b_statistics_length = len(function_b[list(function_b.keys())[0]])
+      extra_values = [
+          fill_value for i in range(0, function_b_statistics_length)
+      ]
+      combined_statistics[function_statistic].extend(extra_values)
+  return combined_statistics
+
+
+def get_function_properties(bitcode_function_path,
+                            passes="forceattrs,print<func-properties>"):
+  properties_dict = {}
+  opt_command_vector = [
+      'opt', f'-passes={passes}', bitcode_function_path,
+      '-enable-detailed-function-properties', '-disable-output',
+      '-force-remove-attribute=optnone'
+  ]
+  try:
+    opt_process = subprocess.run(
+        opt_command_vector,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        encoding='utf-8',
+        timeout=OPT_TIMEOUT_SECONDS)
+  except subprocess.SubprocessError:
+    return ('timeout', None)
+  if opt_process.returncode != 0:
+    return (opt_process.stdout.replace('\n', ''), None)
+  output_lines = opt_process.stdout.split('\n')[1:-2]
+  for output_line in output_lines:
+    line_parts = output_line.split(': ')
+    if len(line_parts) < 2:
+      return ('invalid opt output', None)
+    properties_dict[line_parts[0]] = [line_parts[1]]
+  return (None, properties_dict)
+
+
+def get_function_properties_module(bitcode_module, extra_passes=''):
+  if extra_passes != '':
+    extra_passes += ','
+  properties_dict = {}
+  opt_command_vector = [
+      'opt', f'-passes={extra_passes}forceattrs,print<func-properties>',
+      '-enable-detailed-function-properties', '-force-remove-attribute=optnone',
+      '-disable-output', '-'
+  ]
+  with subprocess.Popen(
+      opt_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as opt_process:
+    try:
+      stdout = opt_process.communicate(
+          input=bitcode_module, timeout=OPT_TIMEOUT_SECONDS)[0].decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('timeout', None)
+    if opt_process.returncode != 0:
+      return (stdout.replace('\n', ''), None)
+
+    start_index = 0
+    output_lines_raw = stdout.split('\n')[:-2]
+
+    while start_index < len(output_lines_raw):
+      if output_lines_raw[start_index].startswith('Printing'):
+        break
+      start_index += 1
+
+    output_lines = output_lines_raw[start_index:]
+    if len(output_lines) == 0:
+      return ('no functions found in bitcode file', None)
+    for output_line in output_lines:
+      if output_line.startswith('Printing'):
+        continue
+      elif output_line == '':
+        continue
+      line_parts = output_line.split(': ')
+      if line_parts[0] in properties_dict:
+        properties_dict[line_parts[0]].append(line_parts[1])
+      else:
+        if len(line_parts) < 2:
+          return ('invalid output from opt', None)
+        properties_dict[line_parts[0]] = [line_parts[1]]
+    return (None, properties_dict)
+
+
+def get_instruction_counts(bitcode_module, additional_passes=''):
+  properties_or_error = get_function_properties_module(bitcode_module,
+                                                       additional_passes)
+  if properties_or_error[0]:
+    return None
+  else:
+    return [
+        int(inst_count)
+        for inst_count in properties_or_error[1]['TotalInstructionCount']
+    ]
+
+
+def get_instruction_histogram(bitcode_module, additional_passes=''):
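+  # Run the instcount pass with -stats over the module and parse the statistics
+  # output into a histogram mapping each instruction name to its count (wrapped
+  # in a list so results can be merged with combine_statistics).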
+  if additional_passes != '':
+    additional_passes += ','
+  instruction_histogram = {}
+  opt_command_vector = [
+      'opt', '-disable-output', f'-passes={additional_passes}instcount',
+      '-stats'
+  ]
+  with subprocess.Popen(
+      opt_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as opt_process:
+    try:
+      output = opt_process.communicate(input=bitcode_module)[0].decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('timeout', None)
+    if opt_process.returncode != 0:
+      return ('opt did not return with code zero', None)
+    # Work on parsing the output
+    output_lines = output.split('\n')
+    # Skip the first five lines as they contain the stats header
+    for output_line in output_lines[5:-2]:
+      if 'instcount' not in output_line:
+        continue
+      output_line_parts = output_line.split()
+      if len(output_line_parts) < 7:
+        return ('opt returned invalid output', None)
+      # Statistics line format: <count> instcount - Number of <inst name> insts
+      # This check skips all non-instruction statistics also collected by the pass.
+      if output_line_parts[6] != 'insts':
+        continue
+      instruction_name = output_line_parts[5]
+      instruction_count = int(output_line_parts[0])
+      instruction_histogram[instruction_name] = [instruction_count]
+  return (None, instruction_histogram)
+
+
+def get_instruction_histogram_from_file(bitcode_file_path):
+  with open(bitcode_file_path, 'rb') as bitcode_file:
+    return get_instruction_histogram(bitcode_file.read())
+
+
+ at ray.remote(num_cpus=1)
+def get_function_statistics_batch(bitcode_module, function_symbols,
+                                  statistics_type, module_path):
+  statistics = []
+  with tempfile.TemporaryDirectory() as extracted_functions_dir:
+    for function_symbol in function_symbols:
+      expected_extracted_function_path = extract_individual_function(
+          bitcode_module, extracted_functions_dir, function_symbol)
+      function_path = f'{module_path}:{function_symbol}'
+      if expected_extracted_function_path[0]:
+        statistics.append(
+            (expected_extracted_function_path[0], None, function_path))
+        continue
+      bitcode_function_path = expected_extracted_function_path[1]
+      if statistics_type == 'properties':
+        function_statistics_expected = get_function_properties(
+            bitcode_function_path)
+      elif statistics_type == 'passes':
+        function_statistics_expected = get_run_passes_opt(bitcode_function_path)
+      elif statistics_type == 'post_opt_properties':
+        function_statistics_expected = get_function_properties(
+            bitcode_function_path,
+            'forceattrs,default<O3>,print<func-properties>')
+      elif statistics_type == 'instruction_distribution':
+        function_statistics_expected = get_instruction_histogram_from_file(
+            bitcode_function_path)
+      if function_statistics_expected[0]:
+        statistics.append(
+            (function_statistics_expected[0], None, function_path))
+      else:
+        statistics.append(
+            (None, function_statistics_expected[1], function_path))
+  return statistics
+
+
+def get_bitcode_module_function_statistics(bitcode_module, statistics_type,
+                                           module_path):
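+  # Extract the function symbols from the module, split them into fixed-size
+  # batches, and farm each batch out to a Ray worker for statistics collection.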
+  with tempfile.TemporaryDirectory() as extracted_functions_dir:
+    function_symbols_expected = get_function_symbols(bitcode_module)
+
+    if function_symbols_expected[0]:
+      return [(function_symbols_expected[0], None, module_path)]
+
+    function_symbols = function_symbols_expected[1]
+
+    statistics_futures = []
+    batches = parallel.split_batches(function_symbols, BITCODE_FILE_CHUNK_SIZE)
+    for batch in batches:
+      statistics_futures.append(
+          get_function_statistics_batch.remote(bitcode_module, batch,
+                                               statistics_type, module_path))
+
+    statistics_chunks = ray.get(statistics_futures)
+    statistics = []
+    for statistics_chunk in statistics_chunks:
+      statistics.extend(statistics_chunk)
+  return statistics
+
+
+def test_parsing(bitcode_module):
+  opt_command_vector = ['opt', '-', '-o', '/dev/null']
+  with subprocess.Popen(
+      opt_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as opt_process:
+    stdout = opt_process.communicate(
+        input=bitcode_module, timeout=OPT_TIMEOUT_SECONDS)[0].decode('utf-8')
+    return (stdout.replace('\n', ''), {
+        'parseable': [opt_process.returncode == 0]
+    })
+
+
+def get_size(bitcode_module):
+  return (None, {'size': [len(bitcode_module)]})
+
+
+def get_textual_ir(bitcode_module):
+  dis_command_vector = ['llvm-dis', '-']
+  with subprocess.Popen(
+      dis_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as dis_process:
+    try:
+      output = dis_process.communicate(
+          input=bitcode_module,
+          timeout=LLVM_DIS_TIMEOUT_SECONDS)[0].decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('timeout', None)
+    if dis_process.returncode != 0:
+      return ('llvm-dis returned code other than 0', None)
+    return (None, output)
+
+
+def get_size_text(bitcode_module):
+  textual_ir_or_error = get_textual_ir(bitcode_module)
+  if textual_ir_or_error[0]:
+    return (textual_ir_or_error[0], None)
+  return (None, {'size': [len(textual_ir_or_error[1])]})
+
+
+def get_token_count(bitcode_module, vocab_path):
+  textual_ir_or_error = get_textual_ir(bitcode_module)
+  if textual_ir_or_error[0]:
+    return (textual_ir_or_error[0], None)
+  with tempfile.NamedTemporaryFile(
+  ) as textual_ir_file, tempfile.NamedTemporaryFile() as tokenized_file:
+    textual_ir_file.write(textual_ir_or_error[1].encode('utf-8'))
+    fast_command_vector = [
+        'fast', 'applybpe', tokenized_file.name, textual_ir_file.name,
+        vocab_path
+    ]
+    try:
+      fast_process = subprocess.run(
+          fast_command_vector,
+          stdout=subprocess.PIPE,
+          stderr=subprocess.STDOUT,
+          timeout=FASTBPE_TIMEOUT_SECONDS)
+      if fast_process.returncode != 0:
+        return ('fastbpe returned non-zero exit code', None)
+      output = tokenized_file.read().decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('fastbpe timeout expired', None)
+    return (None, output.count('@@'))
+
+
+def get_lowered_size(bitcode_module):
+  # Run llc on the bitcode to lower to assembly
+  llc_command_vector = ['llc', '-filetype=obj', '-']
+  with subprocess.Popen(
+      llc_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as llc_process:
+    llc_output = llc_process.communicate(input=bitcode_module)[0]
+    if llc_process.returncode != 0:
+      return ('llc returned non-zero exit code', None)
+  # Use llvm-size to measure the output size
+  # Note that the format specified here actually impacts the output text size
+  # as certain modes that LLVM aims to be compatible with count things differently.
+  # --format=sysv seems to specifically count data contained in .text sections, which
+  # is what we're after.
+  llvm_size_command_vector = ['llvm-size', '--format=sysv', '-']
+  with subprocess.Popen(
+      llvm_size_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as llvm_size_process:
+    llvm_size_output = llvm_size_process.communicate(
+        input=llc_output)[0].decode('utf-8')
+  llvm_size_output_lines = llvm_size_output.split('\n')
+  if len(llvm_size_output_lines) < 3:
+    return ('llvm-size returned invalid output', None)
+  if len(llvm_size_output_lines[2].split()) < 2:
+    return ('llvm-size returned invalid output', None)
+  return (None, int(llvm_size_output_lines[2].split()[1]))
+
+
+def get_optimized_bitcode(bitcode_module):
+  # Run the opt O3 pipeline on the module.
+  opt_command_vector = ['opt', '-passes=default<O3>', '-']
+  with subprocess.Popen(
+      opt_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as opt_process:
+    return opt_process.communicate(input=bitcode_module)[0]
+
+
+def strip_debuginfo(bitcode_module):
+  # Run opt -strip-debug to get rid of debug information.
+  opt_command_vector = ['opt', '-strip-debug', '-']
+  with subprocess.Popen(
+      opt_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as opt_process:
+    return opt_process.communicate(input=bitcode_module)[0]
+
+
+def get_lowered_size_post_opt(bitcode_module):
+  optimized_bc = get_optimized_bitcode(bitcode_module)
+  return get_lowered_size(optimized_bc)
+
+
+def get_call_names_pass_path():
+  return shutil.which('libPrintCallNamesPass.so')
+
+
+def get_call_names(bitcode_module):
+  call_names_pass_path = get_call_names_pass_path()
+  opt_command_vector = [
+      'opt', '-load-pass-plugin', call_names_pass_path,
+      '-passes=print<call-names>', '-disable-output', '-'
+  ]
+  with subprocess.Popen(
+      opt_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as opt_process:
+    opt_output = opt_process.communicate(
+        input=bitcode_module)[0].decode('utf-8')
+    if (opt_process.returncode != 0):
+      return []
+    return opt_output.split('\n')[:-1]
+
+
+def get_defined_function_names(bitcode_module):
+  opt_command_vector = [
+      'opt', '-load-pass-plugin',
+      get_call_names_pass_path(), '-passes=print<definition-names>',
+      '-disable-output'
+  ]
+  with subprocess.Popen(
+      opt_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      stdin=subprocess.PIPE) as opt_process:
+    try:
+      stdout = opt_process.communicate(input=bitcode_module)[0].decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('timeout', None)
+    if opt_process.returncode != 0:
+      return ('opt returned code other than 0', None)
+    return (None, stdout.split('\n')[:-1])
+
+
+def get_function_hashes(bitcode_module, additional_passes=''):
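+  # Run the detailed structural hash printer over the module and parse the
+  # per-function hashes along with the overall module hash. Returns a tuple of
+  # (error, function_hash_dict, module_hash).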
+  if additional_passes != '':
+    additional_passes = additional_passes + ','
+  opt_hashing_vector = [
+      'opt',
+      f'-passes={additional_passes}forceattrs,print<structural-hash><detailed>',
+      '-disable-output', '-', '-force-remove-attribute=optnone'
+  ]
+  with subprocess.Popen(
+      opt_hashing_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT) as opt_process:
+    try:
+      opt_output = opt_process.communicate(
+          input=bitcode_module, timeout=OPT_TIMEOUT_SECONDS)[0].decode('utf-8')
+    except subprocess.TimeoutExpired:
+      return ('timeout', None, None)
+    except UnicodeDecodeError:
+      return ('unicode error, opt returned invalid output', None, None)
+    if opt_process.returncode != 0:
+      return ('opt did not exit with code 0', None, None)
+    function_hashes = {}
+    output_lines = opt_output.split('\n')
+
+    start_line_index = 0
+    while start_line_index < len(output_lines):
+      if output_lines[start_line_index].startswith("Module Hash:"):
+        break
+      start_line_index += 1
+
+    if start_line_index == len(output_lines):
+      return ('invalid output from opt - did not find module hash line.', None,
+              None)
+
+    module_hash_line_parts = output_lines[start_line_index].split()
+    module_hash = module_hash_line_parts[2]
+    for output_line in output_lines[(start_line_index + 1):-1]:
+      output_line_parts = output_line.split()
+      if len(output_line_parts) < 4:
+        return ('invalid output from opt', None, None)
+      function_name = output_line_parts[-3]
+      function_hash = output_line_parts[-1]
+      function_hashes[function_name] = function_hash
+    return (None, function_hashes, module_hash)
+
+
+ at ray.remote(num_cpus=1)
+def get_module_statistics_batch(project_dir,
+                                module_paths,
+                                statistics_type,
+                                filter='none',
+                                extra_properties={}):
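+  # Load each module from the corpus, optionally filter it by source language
+  # based on the recorded compiler command line, and compute the requested
+  # statistic. Each result is an (error, value, module_path) tuple.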
+  statistics = []
+  for relative_module_path in module_paths:
+    bitcode_file = dataset_corpus.load_file_from_corpus(project_dir,
+                                                        relative_module_path)
+    if filter != 'none':
+      command_line_path = os.path.splitext(relative_module_path)[0] + '.cmd'
+      command_line = dataset_corpus.load_file_from_corpus(
+          project_dir, command_line_path).decode('utf-8')
+      # This is a very hacky heuristic, mostly based on how many include paths
+      # the driver tries to add to the frontend command line. Might need to be
+      # fixed in the future for portability.
+      if filter == 'cpp' and command_line.count('c++') <= 1:
+        continue
+      elif filter == 'c' and command_line.count('c++') > 1:
+        continue
+
+    module_path = f'{project_dir}:{relative_module_path}'
+    if statistics_type == 'parsing':
+      parse_result = test_parsing(bitcode_file)
+      if parse_result[1]['parseable'][0]:
+        statistics.append((None, parse_result[1], module_path))
+      else:
+        statistics.append((parse_result[0], parse_result[1], module_path))
+    elif statistics_type == 'module_size':
+      if bitcode_file is None:
+        continue
+      statistics.append((None, get_size(bitcode_file)[1], module_path))
+    elif statistics_type == 'module_size_text':
+      text_size_or_error = get_size_text(bitcode_file)
+      if text_size_or_error[0]:
+        statistics.append((text_size_or_error[0], None, module_path))
+      else:
+        statistics.append((None, text_size_or_error[1], module_path))
+    elif statistics_type == 'get_lowered_size':
+      lowered_size_or_error = get_lowered_size(bitcode_file)
+      if lowered_size_or_error[0] is not None:
+        statistics.append((lowered_size_or_error[0], None, module_path))
+        continue
+      lowered_size = lowered_size_or_error[1]
+      wrapped_result = {'lowered_size': [lowered_size]}
+      statistics.append((None, wrapped_result, module_path))
+    elif statistics_type == 'get_opt_lowered_size':
+      post_opt_lowered_size_or_error = get_lowered_size_post_opt(bitcode_file)
+      if post_opt_lowered_size_or_error[0] is not None:
+        statistics.append(
+            (post_opt_lowered_size_or_error[0], None, module_path))
+        continue
+      post_opt_lowered_size = post_opt_lowered_size_or_error[1]
+      wrapped_result = {'post_opt_lowered_size': [post_opt_lowered_size]}
+      statistics.append((None, wrapped_result, module_path))
+    elif statistics_type == 'call_names':
+      for call_name in get_call_names(bitcode_file):
+        call_names_wrapped = {'call_names': [call_name]}
+        statistics.append((None, call_names_wrapped, module_path))
+    elif statistics_type == 'function_hashes' or statistics_type == 'post_O3_function_hashes':
+      additional_passes = '' if statistics_type == 'function_hashes' else 'default<O3>'
+      function_hashes_or_error = get_function_hashes(bitcode_file,
+                                                     additional_passes)
+      if function_hashes_or_error[0]:
+        statistics.append((function_hashes_or_error[0], None, module_path))
+        continue
+      function_hashes = function_hashes_or_error[1]
+      for function_name in function_hashes:
+        hash_wrapped = {'function_hashes': [function_hashes[function_name]]}
+        statistics.append(
+            (None, hash_wrapped, f'{module_path}:{function_name}'))
+    elif statistics_type == 'module_hashes':
+      module_hash_or_error = get_function_hashes(bitcode_file)
+      if module_hash_or_error[0]:
+        statistics.append((module_hash_or_error[0], None, module_path))
+      else:
+        hash_wrapped = {'module_hashes': [module_hash_or_error[2]]}
+        statistics.append((None, hash_wrapped, module_path))
+    elif statistics_type == 'module_properties' or statistics_type == 'module_properties_O3':
+      additional_passes = '' if statistics_type == 'module_properties' else 'default<O3>'
+      properties_tuple = get_function_properties_module(bitcode_file,
+                                                        additional_passes)
+      if properties_tuple[0]:
+        statistics.append((properties_tuple[0], None, module_path))
+      else:
+        statistics.append((None, properties_tuple[1], module_path))
+    elif statistics_type == 'module_instruction_distribution' or statistics_type == 'module_instruction_distribution_O3':
+      additional_passes = '' if statistics_type == 'module_instruction_distribution' else 'default<O3>'
+      instruction_hist_or_error = get_instruction_histogram(
+          bitcode_file, additional_passes)
+      if instruction_hist_or_error[0]:
+        statistics.append((instruction_hist_or_error[0], None, module_path))
+      else:
+        statistics.append((None, instruction_hist_or_error[1], module_path))
+    elif statistics_type == 'defined_function_names':
+      function_names_or_error = get_defined_function_names(bitcode_file)
+      if function_names_or_error[0]:
+        statistics.append((function_names_or_error[0], None, module_path))
+      else:
+        for defined_function_name in function_names_or_error[1]:
+          function_name_wrapped = {'defined_function': [defined_function_name]}
+          statistics.append((None, function_name_wrapped, module_path))
+    elif statistics_type == 'token_count':
+      token_count_or_error = get_token_count(bitcode_file,
+                                             extra_properties['bpe_vocab_path'])
+      if token_count_or_error[0]:
+        statistics.append((token_count_or_error[0], None, module_path))
+      else:
+        token_count_wrapped = {'token_count': [token_count_or_error[1]]}
+        statistics.append((None, token_count_wrapped, module_path))
+  return statistics
+
+
+def get_tokenization(bitcode_module):
+  tokenizer_command_vector = ['llvm-tokenizer', '-output-mode=json', '-']
+  with subprocess.Popen(
+      tokenizer_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.PIPE) as tokenizer_process:
+    try:
+      stdout = tokenizer_process.communicate(input=bitcode_module)[0]
+      return json.loads(stdout)
+    except json.JSONDecodeError:
+      # TODO(boomanaiden154): This is failing pretty often. Get more debug
+      # information (like file path) into these logs so we can do downstream
+      # analysis.
+      logging.warning('Failed to decode JSON')
+      return {}
+
+
+def get_serialized_tokenization(bitcode_module, int_constants_path):
+  tokenizer_command_vector = [
+      'llvm-tokenizer', '-output-mode=json', '-mode=serialize',
+      f'-int-constants-list={int_constants_path}'
+  ]
+  with subprocess.Popen(
+      tokenizer_command_vector,
+      stdin=subprocess.PIPE,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.PIPE) as tokenizer_process:
+    try:
+      stdout = tokenizer_process.communicate(input=bitcode_module)[0]
+      tokenizer_output = json.loads(stdout)
+
+      tokenization = []
+
+      for function in tokenizer_output['functions']:
+        tokenization += function['tokens']
+
+      return tokenization
+    except json.JSONDecodeError:
+      logging.warning('Failed to decode JSON')
+      return []
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/dataset_corpus.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/dataset_corpus.py
new file mode 100644
index 000000000000000..5f09ce858929e48
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/dataset_corpus.py
@@ -0,0 +1,59 @@
+"""Tools for working with llvm-ir-dataset-utls corpora"""
+
+import tarfile
+import logging
+import os
+import json
+
+
+def load_file_from_corpus(corpus_path, file_name):
+  if corpus_path[-3:] == "tar":
+    with tarfile.open(corpus_path) as build_archive:
+      try:
+        file_to_extract = build_archive.extractfile(file_name)
+        return file_to_extract.read()
+      except (tarfile.TarError, KeyError):
+        logging.warning(
+            f'Failed to read {file_name} in {corpus_path}: tar archive error.')
+        return None
+  else:
+    file_path = os.path.join(corpus_path, file_name)
+    if not os.path.exists(file_path):
+      logging.warning(f'Expected {file_name} in {corpus_path} does not exist.')
+      return None
+    with open(file_path, 'rb') as file_to_read:
+      return file_to_read.read()
+
+
+def load_json_from_corpus(corpus_path, file_name):
+  file_contents = load_file_from_corpus(corpus_path, file_name)
+  if file_contents is None:
+    # Error logging should be handled by load_file_from_corpus
+    return None
+  return json.loads(file_contents)
+
+
+def get_bitcode_file_paths(corpus_path):
+  corpus_description = load_json_from_corpus(corpus_path,
+                                             './corpus_description.json')
+  return ['./' + module + '.bc' for module in corpus_description['modules']]
+
+
+def get_corpus_name(corpus_path):
+  if corpus_path[-3:] == 'tar':
+    return os.path.basename(corpus_path)[:-4]
+  return os.path.basename(corpus_path)
+
+
+def is_file_in_corpus(corpus_path, file_to_test):
+  if corpus_path[-3:] == 'tar':
+    with tarfile.open(corpus_path) as corpus_archive:
+      return file_to_test in corpus_archive.getnames()
+  return os.path.exists(os.path.join(corpus_path, file_to_test))
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/file.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/file.py
new file mode 100644
index 000000000000000..a1c9ea0f3354b7b
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/file.py
@@ -0,0 +1,16 @@
+"""File utilities"""
+
+import shutil
+import os
+
+
+def delete_directory(directory_path, corpus_path):
+  if os.path.exists(directory_path):
+    try:
+      shutil.rmtree(directory_path)
+    except Exception as e:
+      with open(os.path.join(corpus_path, 'error.log'), 'a+') as error_log_file:
+        error_log_file.write(f'{e}\n')
+  else:
+    with open(os.path.join(corpus_path, 'error.log'), 'a+') as error_log_file:
+      error_log_file.write(f'no directory {directory_path} to delete\n')
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/github_api.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/github_api.py
new file mode 100644
index 000000000000000..8812568d0283b2b
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/github_api.py
@@ -0,0 +1,16 @@
+"""Utilities for accessing parts of the github API"""
+
+import requests
+
+
+def get_license_from_repo(repo_owner, repo_name, api_token):
+  headers = {
+      'Accept': 'application/vnd.github+json',
+      'Authorization': f'Bearer {api_token}',
+      'X-Github-Api-Version': '2022-11-28'
+  }
+  endpoint = f'https://api.github.com/repos/{repo_owner}/{repo_name}/license'
+  # TODO(boomanaiden154): Get rid of verify=False and replace it with a
+  # REQUESTS_CA_BUNDLE definition in environments where it is necessary.
+  result = requests.get(endpoint, headers=headers, verify=False)
+  return result.json()['license']['spdx_id']
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/licenses.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/licenses.py
new file mode 100644
index 000000000000000..6b9ccc51138399c
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/licenses.py
@@ -0,0 +1,173 @@
+"""Some utilities to deal with license information"""
+
+import requests
+import json
+import logging
+import os
+import tempfile
+import subprocess
+import sys
+
+from llvm_ir_dataset_utils.sources import git_source
+
+GITHUB_GRAPHQL_URL = 'https://api.github.com/graphql'
+
+
+def generate_repository_spdx_request(repo_index, repository_url):
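+  # Build a GraphQL sub-query aliased as repo<index> so that license lookups
+  # for many repositories can be batched into a single API request.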
+  repository_parts = repository_url.split('/')
+  repository_owner = repository_parts[3]
+  repository_name = repository_parts[4]
+  return (
+      f'repo{repo_index}: repository(owner: "{repository_owner}", name: "{repository_name}") {{\n'
+      '  licenseInfo {\n'
+      '    spdxId\n'
+      '  }\n'
+      '}\n')
+
+
+def get_repository_licenses(repository_list, api_token):
+  if len(repository_list) > 200:
+    # if the number of repositories is greater than 200, split up into
+    # multiple queries.
+    full_repository_license_map = {}
+    start_index = 0
+    while start_index < len(repository_list):
+      end_index = start_index + 200
+      full_repository_license_map.update(
+          get_repository_licenses(repository_list[start_index:end_index],
+                                  api_token))
+      start_index += 200
+      logging.info('Just collected license information on 200 repositories')
+
+    return full_repository_license_map
+
+  query_string = '{\n'
+
+  for index, repository_url in enumerate(repository_list):
+    query_string += generate_repository_spdx_request(index, repository_url)
+
+  query_string += '}'
+
+  query_json = {'query': query_string}
+  headers = {'Authorization': f'token {api_token}'}
+  api_request = requests.post(
+      url=GITHUB_GRAPHQL_URL, json=query_json, headers=headers)
+
+  license_data = json.loads(api_request.text)
+
+  repository_license_map = {}
+
+  if license_data['data'] is None:
+    logging.error(f'GitHub license query returned no data: {license_data}')
+    sys.exit(1)
+
+  for repository in license_data['data']:
+    repository_index = int(repository[4:])
+    repository_url = repository_list[repository_index]
+    if license_data['data'][repository] is None or license_data['data'][
+        repository]['licenseInfo'] is None:
+      repository_license_map[repository_url] = 'NOASSERTION'
+      continue
+    license_id = license_data['data'][repository]['licenseInfo']['spdxId']
+    repository_license_map[repository_url] = license_id
+
+  return repository_license_map
+
+
+def get_detected_license_from_dir(repo_dir):
+  detector_command_line = ['license-detector', '-f', 'json', './']
+  license_detector_process = subprocess.run(
+      detector_command_line, cwd=repo_dir, stdout=subprocess.PIPE, check=True)
+  license_info = json.loads(license_detector_process.stdout.decode('utf-8'))
+  primary_project = license_info[0]
+  if 'error' in primary_project:
+    return 'NOASSERTION'
+  licenses_matched = primary_project['matches']
+  if licenses_matched[0]['confidence'] > 0.9:
+    return licenses_matched[0]['license']
+  return 'NOASSERTION'
+
+
+def get_detected_license_from_repo(repo_url, repo_name):
+  with tempfile.TemporaryDirectory() as temp_dir:
+    base_dir = os.path.join(temp_dir, 'base')
+    corpus_dir = os.path.join(temp_dir, 'corpus')
+    os.mkdir(base_dir)
+    os.mkdir(corpus_dir)
+    source_status = git_source.download_source_code(repo_url, repo_name, None,
+                                                    base_dir, corpus_dir)
+    if not source_status['success']:
+      return 'NOASSERTION'
+    project_dir = os.path.join(base_dir, repo_name)
+    return get_detected_license_from_dir(project_dir)
+
+
+def upgrade_deprecated_spdx_id(spdx_id):
+  if not spdx_id.startswith('deprecated'):
+    # Nothing to do here
+    return spdx_id
+  match (spdx_id[11:]):
+    case 'AGPL-3.0':
+      return 'AGPL-3.0-only'
+    case 'GFDL-1.3':
+      return 'GFDL-1.3-only'
+    case 'GPL-2.0':
+      return 'GPL-2.0-only'
+    case 'GPL-2.0+':
+      return 'GPL-2.0-or-later'
+    case 'GPL-3.0':
+      return 'GPL-3.0-only'
+    case 'GPL-3.0+':
+      return 'GPL-3.0-or-later'
+    case 'LGPL-2.0':
+      return 'LGPL-2.0-only'
+    case 'LGPL-2.0+':
+      return 'LGPL-2.0-or-later'
+    case 'LGPL-2.1+':
+      return 'LGPL-2.1-or-later'
+    case 'LGPL-3.0':
+      return 'LGPL-3.0-only'
+    case 'LGPL-3.0+':
+      return 'LGPL-3.0-or-later'
+    case _:
+      # Just return the deprecated ID here if we don't have a translation
+      # to ensure that we aren't losing any information.
+      return spdx_id
+
+
+def get_all_license_files(repo_dir):
+  if not os.path.exists(repo_dir):
+    logging.warning(
+        f'Could not find any licenses in {repo_dir} as it does not exist')
+    return []
+  detector_command_line = ['license-detector', '-f', 'json', './']
+  license_detector_process = subprocess.run(
+      detector_command_line, cwd=repo_dir, stdout=subprocess.PIPE)
+  if license_detector_process.returncode != 0:
+    logging.warning('license detector failed with non-zero return code')
+    return []
+  license_info = json.loads(license_detector_process.stdout.decode('utf-8'))
+  if 'matches' not in license_info[0]:
+    return []
+  matches = license_info[0]['matches']
+  license_files_map = {}
+  license_files_confidence = {}
+  for license_match in matches:
+    if license_match['file'] not in license_files_confidence:
+      license_files_map[license_match['file']] = license_match['license']
+      license_files_confidence[
+          license_match['file']] = license_match['confidence']
+      continue
+    if license_files_confidence[
+        license_match['file']] > license_match['confidence']:
+      continue
+    license_files_map[license_match['file']] = license_match['license']
+    license_files_confidence[
+        license_match['file']] = license_match['confidence']
+  license_files = []
+  for license_file in license_files_map:
+    license_files.append({
+        'file': license_file,
+        'license': upgrade_deprecated_spdx_id(license_files_map[license_file])
+    })
+  return license_files
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/parallel.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/parallel.py
new file mode 100644
index 000000000000000..b2333448918caf6
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/parallel.py
@@ -0,0 +1,17 @@
+"""Utilities for job distribution and execution."""
+
+
+# TODO(boomanaiden154): Write some unit tests for this function.
+def split_batches(individual_jobs, batch_size):
+  # Split the job list into consecutive chunks of at most batch_size entries,
+  # without emitting a trailing empty batch.
+  batches = []
+  for start_index in range(0, len(individual_jobs), batch_size):
+    batches.append(individual_jobs[start_index:start_index + batch_size])
+  return batches
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/pass_list_constants.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/pass_list_constants.py
new file mode 100644
index 000000000000000..96490a443b676ee
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/pass_list_constants.py
@@ -0,0 +1,66 @@
+# This module contains a list of pass constants that are used throughout the
+# project while doing various analyses.
+
+LOOP_PASS_LIST = [
+    'IndVarSimplifyPass',
+    'LICMPass',
+    'LoopDeletionPass',
+    'LoopDistributionPass',
+    'LoopFullUnrollPass',
+    'LoopIdiomRecognizePass',
+    'LoopInstSimplifyPass',
+    'LoopLoadEliminationPass',
+    'LoopRotatePass',
+    'LoopSimplifyCFGPass',
+    'LoopSimplifyPass',
+    'LoopSinkPass',
+    'LoopUnrollPass',
+    'LoopVectorizePass',
+    'SimpleLoopUnswitchPass',
+]
+
+OPT_DEFAULT_O3_PASS_LIST = [
+    'Annotation2MetadataPass1', 'ForceFunctionAttrsPass1',
+    'InferFunctionAttrsPass1', 'CoroEarlyPass1', 'LowerExpectIntrinsicPass1',
+    'SimplifyCFGPass1', 'SROAPass1', 'EarlyCSEPass1', 'CallSiteSplittingPass1',
+    'OpenMPOptPass1', 'IPSCCPPass1', 'CalledValuePropagationPass1',
+    'GlobalOptPass1', 'PromotePass1', 'InstCombinePass1', 'SimplifyCFGPass2',
+    'RequireAnalysisPass<llvm::GlobalsAA, llvm::Module>1',
+    'InvalidateAnalysisPass<llvm::AAManager>1',
+    'RequireAnalysisPass<llvm::ProfileSummaryAnalysis, llvm::Module>1',
+    'InlinerPass1', 'InlinerPass2', 'PostOrderFunctionAttrsPass1',
+    'ArgumentPromotionPass1', 'OpenMPOptCGSCCPass1', 'SROAPass2',
+    'EarlyCSEPass2', 'SpeculativeExecutionPass1', 'JumpThreadingPass1',
+    'CorrelatedValuePropagationPass1', 'SimplifyCFGPass3', 'InstCombinePass2',
+    'AggressiveInstCombinePass1', 'LibCallsShrinkWrapPass1',
+    'TailCallElimPass1', 'SimplifyCFGPass4', 'ReassociatePass1',
+    'RequireAnalysisPass<llvm::OptimizationRemarkEmitterAnalysis, llvm::Function>1',
+    'LoopSimplifyPass1', 'LCSSAPass1', 'LoopInstSimplifyPass1',
+    'LoopSimplifyCFGPass1', 'LICMPass1', 'LoopRotatePass1',
+    'SimpleLoopUnswitchPass1', 'SimplifyCFGPass5', 'InstCombinePass3',
+    'LCSSAPass2', 'LoopIdiomRecognizePass1', 'IndVarSimplifyPass1',
+    'LoopDeletionPass1', 'LoopFullUnrollPass1', 'SROAPass3',
+    'VectorCombinePass1', 'MergedLoadStoreMotionPass1', 'GVNPass1', 'SCCPPass1',
+    'BDCEPass1', 'InstCombinePass4', 'JumpThreadingPass2',
+    'CorrelatedValuePropagationPass2', 'ADCEPass1', 'MemCpyOptPass1',
+    'DSEPass1', 'LCSSAPass3', 'CoroElidePass1', 'SimplifyCFGPass6',
+    'InstCombinePass5', 'CoroSplitPass1', 'InlinerPass3', 'InlinerPass4',
+    'PostOrderFunctionAttrsPass2', 'ArgumentPromotionPass2',
+    'OpenMPOptCGSCCPass2', 'CoroSplitPass2',
+    'InvalidateAnalysisPass<llvm::ShouldNotRunFunctionPassesAnalysis>1',
+    'DeadArgumentEliminationPass1', 'CoroCleanupPass1', 'GlobalOptPass2',
+    'GlobalDCEPass1', 'EliminateAvailableExternallyPass1',
+    'ReversePostOrderFunctionAttrsPass1', 'RecomputeGlobalsAAPass1',
+    'Float2IntPass1', 'LowerConstantIntrinsicsPass1', 'LCSSAPass4',
+    'LoopDistributePass1', 'InjectTLIMappings1', 'LoopVectorizePass1',
+    'LoopLoadEliminationPass1', 'InstCombinePass6', 'SimplifyCFGPass7',
+    'SLPVectorizerPass1', 'VectorCombinePass2', 'InstCombinePass7',
+    'LoopUnrollPass1', 'WarnMissedTransformationsPass1', 'SROAPass4',
+    'InstCombinePass8',
+    'RequireAnalysisPass<llvm::OptimizationRemarkEmitterAnalysis, llvm::Function>2',
+    'LCSSAPass5', 'AlignmentFromAssumptionsPass1', 'LoopSinkPass1',
+    'InstSimplifyPass1', 'DivRemPairsPass1', 'TailCallElimPass2',
+    'SimplifyCFGPass8', 'GlobalDCEPass2', 'ConstantMergePass1',
+    'CGProfilePass1', 'RelLookupTableConverterPass1', 'AnnotationRemarksPass1',
+    'VerifierPass1', 'BitcodeWriterPass1'
+]
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/spack.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/spack.py
new file mode 100644
index 000000000000000..c5bd70ec5753409
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/util/spack.py
@@ -0,0 +1,74 @@
+"""Utilities related to spack."""
+
+import subprocess
+import os
+
+
+def get_spack_arch_info(info_type):
+  spack_arch_command_vector = ['spack', 'arch', f'--{info_type}']
+  arch_process = subprocess.run(
+      spack_arch_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      check=True)
+  return arch_process.stdout.decode('utf-8').rsplit()[0]
+
+
+def get_compiler_version():
+  compiler_command_vector = ['clang', '--version']
+  compiler_version_process = subprocess.run(
+      compiler_command_vector,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.STDOUT,
+      check=True)
+  version_line = compiler_version_process.stdout.decode('utf-8').split('\n')[0]
+  version_line_parts = version_line.split(' ')
+  for index, version_line_part in enumerate(version_line_parts):
+    if version_line_part == 'version':
+      return version_line_parts[index + 1]
+
+
+def get_spack_compiler_config():
+  compiler_config = (
+      "compilers:\n"
+      "- compiler:\n"
+      f"    spec: clang@={get_compiler_version()}\n"
+      "    paths:\n"
+      "      cc: /usr/bin/clang\n"
+      "      cxx: /usr/bin/clang++\n"
+      "      f77: /usr/bin/gfortran\n"
+      "      fc: /usr/bin/gfortran\n"
+      "    flags:\n"
+      "      cflags: -Xclang -fembed-bitcode=all\n"
+      "      cxxflags: -Xclang -fembed-bitcode=all\n"
+      f"    operating_system: {get_spack_arch_info('operating-system')}\n"
+      "    target: x86_64\n"
+      "    modules: []\n"
+      "    environment: {}\n"
+      "    extra_rpaths: []")
+  return compiler_config
+
+
+def get_spack_config(build_dir):
+  spack_config = ("config:\n"
+                  "  install_tree:\n"
+                  f"    root: {build_dir}/spack-installs\n"
+                  "    padded_length: 512\n"
+                  "  build_stage:\n"
+                  f"    - {build_dir}/build-stage\n"
+                  f"  test_stage: {build_dir}/test-stage\n"
+                  f"  source_cache: {build_dir}/source-cache\n"
+                  f"  misc_cache: {build_dir}/misc-cache")
+  return spack_config
+
+
+def spack_setup_compiler(build_dir):
+  compiler_config_path = os.path.join(build_dir, '.spack/compilers.yaml')
+  with open(compiler_config_path, 'w') as compiler_config_file:
+    compiler_config_file.writelines(get_spack_compiler_config())
+
+
+def spack_setup_config(build_dir):
+  spack_config_path = os.path.join(build_dir, '.spack/config.yaml')
+  with open(spack_config_path, 'w') as spack_config_file:
+    spack_config_file.writelines(get_spack_config(build_dir))
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/bitcode_histograms.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/bitcode_histograms.py
new file mode 100644
index 000000000000000..7f45e20887a42e9
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/bitcode_histograms.py
@@ -0,0 +1,154 @@
+"""A tool for generating visualizations of bitcode distributions across
+languages.
+"""
+
+import logging
+import os
+import csv
+
+import pandas
+import plotly.express
+import plotly.io
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string('bc_dist_file', None, 'The path to a data file.')
+flags.DEFINE_multi_string(
+    'opt_bc_dist_file', None,
+    'The path to a data file containing data gathered post-optimization.')
+flags.DEFINE_string('output_file', None, 'The path to the output image.')
+flags.DEFINE_string(
+    'output_data_file', None,
+    'The path to a CSV file to place the data used to generate the figure.')
+
+flags.mark_flag_as_required('bc_dist_file')
+flags.mark_flag_as_required('output_file')
+
+OPCODES_TO_ANALYZE = [
+    'Load', 'GetElementPtr', 'Call', 'BitCast', 'Store', 'Alloca', 'Br',
+    'AddrSpaceCast', 'Ret', 'ICmp', 'ExtractValue', 'Invoke'
+]
+
+
+def compute_cumulative_histogram_from_file(file_path):
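+  # Sum the opcode counts from every row of the CSV file into a single
+  # cumulative histogram.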
+  histogram = {}
+  with open(file_path) as bc_dist_file:
+    dict_reader = csv.DictReader(bc_dist_file)
+    for data_row in dict_reader:
+      for instruction_type in data_row:
+        if instruction_type == 'name':
+          continue
+        instruction_count = int(data_row[instruction_type])
+        if instruction_type in histogram:
+          histogram[instruction_type] += instruction_count
+        else:
+          histogram[instruction_type] = instruction_count
+  return histogram
+
+
+def main(_):
+  distributions = {}
+  instruction_names = []
+  # --opt_bc_dist_file is optional, so fall back to an empty list when unset.
+  opt_bc_dist_files = FLAGS.opt_bc_dist_file or []
+  for bc_dist_file_path in FLAGS.bc_dist_file + opt_bc_dist_files:
+    logging.info(f'Loading data from {bc_dist_file_path}')
+    language_name = os.path.basename(bc_dist_file_path)[:-4]
+    if bc_dist_file_path in opt_bc_dist_files:
+      language_name += ' (Optimized)'
+    distribution = compute_cumulative_histogram_from_file(bc_dist_file_path)
+    instruction_names = list(set(instruction_names + list(distribution.keys())))
+    distributions[language_name] = distribution
+
+  # Ensure that all languages have the same opcodes.
+  for distribution in distributions:
+    for instruction_name in instruction_names:
+      if instruction_name not in distributions[distribution]:
+        distributions[distribution][instruction_name] = 0
+
+  # Normalize the distributions in each language by the instruction count
+  for distribution in distributions:
+    total_instruction_count = 0
+    for instruction_name in distributions[distribution]:
+      total_instruction_count += distributions[distribution][instruction_name]
+    for instruction_name in distributions[distribution]:
+      distributions[distribution][instruction_name] = distributions[
+          distribution][instruction_name] / total_instruction_count
+
+  # Remove all opcodes that aren't in the set that we want.
+  for distribution in distributions:
+    for instruction_name in instruction_names:
+      if instruction_name not in OPCODES_TO_ANALYZE:
+        del distributions[distribution][instruction_name]
+
+  # Add an additional opcodes category so everything sums to one.
+  extra_percentages = {}
+  for distribution in distributions:
+    total_instruction_count = 0
+    for instruction_name in distributions[distribution]:
+      total_instruction_count += distributions[distribution][instruction_name]
+    extra_percentages[distribution] = 1 - total_instruction_count
+
+  data_frame = pandas.DataFrame({
+      'Language': [],
+      'Instruction': [],
+      'Count': []
+  })
+
+  for language_name in distributions:
+    language_names = []
+    instructions = []
+    instruction_counts = []
+
+    for instruction in distributions[language_name]:
+      language_names.append(language_name)
+      instructions.append(instruction)
+      instruction_counts.append(distributions[language_name][instruction])
+
+    language_data_frame = pandas.DataFrame({
+        'Language': language_names,
+        'Instruction': instructions,
+        'Count': instruction_counts
+    })
+
+    language_data_frame.sort_values('Count', ascending=False, inplace=True)
+
+    data_frame = pandas.concat([data_frame, language_data_frame])
+
+  extra_percentages_df = pandas.DataFrame({
+      'Language': list(extra_percentages.keys()),
+      'Instruction': [
+          'Other Instructions' for _ in range(0, len(extra_percentages))
+      ],
+      'Count': list(extra_percentages.values())
+  })
+
+  data_frame = pandas.concat([data_frame, extra_percentages_df],
+                             ignore_index=True)
+
+  if FLAGS.output_data_file:
+    data_frame.pivot(
+        index='Language', columns='Instruction', values='Count').to_csv(
+            FLAGS.output_data_file, index_label='index')
+
+  logging.info('Generating figure.')
+
+  figure = plotly.express.bar(
+      data_frame,
+      x='Language',
+      y='Count',
+      color='Instruction',
+      color_discrete_sequence=plotly.express.colors.qualitative.Alphabet_r)
+
+  figure.update_layout(legend_traceorder="reversed")
+
+  logging.info('Writing figure to file.')
+
+  plotly.io.kaleido.scope.mathjax = None
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/dimensionality_reduction.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/dimensionality_reduction.py
new file mode 100644
index 000000000000000..44654d1dda3bc95
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/dimensionality_reduction.py
@@ -0,0 +1,138 @@
+"""A tool for performing dimensionality reduction and visualizing the results."""
+
+import logging
+import os
+import csv
+
+import numpy
+import pandas
+import umap
+
+from sklearn.preprocessing import StandardScaler
+
+import plotly.express
+import plotly.io
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'properties_file', None,
+    'The path to a file containing a list of functions and their numerical properties.'
+)
+flags.DEFINE_string('output_file', None, 'The path to the output image.')
+flags.DEFINE_string(
+    'output_data_file', None,
+    'The path to a CSV file containing the dimensionality reduction to write '
+    'to or read from.')
+
+flags.mark_flag_as_required('properties_file')
+flags.mark_flag_as_required('output_file')
+
+
+def load_function_properties(file_path):
+  function_properties = {}
+  with open(file_path) as properties_file:
+    properties_reader = csv.DictReader(properties_file)
+    for property_entry in properties_reader:
+      function_name = property_entry['name']
+      property_entry.pop('name')
+      function_properties[function_name] = property_entry
+  return function_properties
+
+
+def get_opcode_set(bitcode_distribution_paths):
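+  # Collect the union of opcode column names by reading the header of each
+  # distribution CSV file.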
+  opcode_set = set()
+  for bitcode_distribution_path in bitcode_distribution_paths:
+    with open(bitcode_distribution_path) as bitcode_dist_file:
+      dist_reader = csv.DictReader(bitcode_dist_file)
+      for dist_row in dist_reader:
+        for opcode_name in dist_row:
+          opcode_set.add(opcode_name)
+        break
+  return list(opcode_set)
+
+
+def add_bitcode_distribution(file_path, function_properties):
+  with open(file_path) as distribution_file:
+    distribution_reader = csv.DictReader(distribution_file)
+    for distribution_entry in distribution_reader:
+      function_name = distribution_entry['name']
+      distribution_entry.pop('name')
+      function_properties[function_name].update(distribution_entry)
+
+
+def convert_to_feature_vector(function_properties):
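+  # Flatten each function's property dictionary into a list of integer
+  # features, relying on the dictionary's insertion order to keep feature
+  # positions consistent across functions.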
+  function_features = []
+  for function in function_properties:
+    individual_function_features = []
+    for function_property in function_properties[function]:
+      individual_function_features.append(
+          int(function_properties[function][function_property]))
+    function_features.append(individual_function_features)
+  return function_features
+
+
+def load_data():
+  function_properties = {}
+
+  colors = []
+
+  logging.info('Loading data')
+  for properties_file in FLAGS.properties_file:
+    language_name = os.path.basename(properties_file)[:-4]
+    new_properties = load_function_properties(properties_file)
+    function_properties.update(new_properties)
+    new_colors = [language_name] * len(new_properties)
+    colors.extend(new_colors)
+
+  # TODO(boomanaiden154): Add in support for adding in opcodes here too.
+  # This needs to account for variability in opcodes between languages
+  # though. Some functions are already implemented above.
+
+  function_feature_vectors = convert_to_feature_vector(function_properties)
+
+  function_feature_arrays = numpy.asarray(function_feature_vectors)
+
+  logging.info('Performing dimensionality reduction')
+
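+  # Standardize each feature to zero mean and unit variance so that no single
+  # property dominates the distance computation.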
+  scaled_data = StandardScaler().fit_transform(function_feature_arrays)
+
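+  # A relatively large neighborhood size biases UMAP towards preserving the
+  # global structure of the data over fine local detail.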
+  reducer = umap.UMAP(n_neighbors=100)
+
+  embedded_features = reducer.fit_transform(scaled_data)
+
+  data_frame = pandas.DataFrame(
+      numpy.asarray(embedded_features), columns=['x', 'y'])
+
+  data_frame.insert(2, "colors", colors)
+
+  return data_frame
+
+
+def main(_):
+  if FLAGS.output_data_file and os.path.exists(FLAGS.output_data_file):
+    logging.info('Loading reduction from CSV file.')
+    data_frame = pandas.read_csv(FLAGS.output_data_file)
+  else:
+    logging.info('Loading data from sources and performing reduction.')
+    data_frame = load_data()
+
+    if FLAGS.output_data_file:
+      logging.info('Writing reduction to CSV file.')
+      data_frame.to_csv(FLAGS.output_data_file)
+
+  figure = plotly.express.scatter(data_frame, x='x', y='y', color='colors')
+
+  figure.update_yaxes(visible=False, showticklabels=False)
+  figure.update_xaxes(visible=False, showticklabels=False)
+
+  plotly.io.kaleido.scope.mathjax = None
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/duplication_heatmap.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/duplication_heatmap.py
new file mode 100644
index 000000000000000..0e8476e66f46bad
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/duplication_heatmap.py
@@ -0,0 +1,148 @@
+"""A script for generating a heatmap showing duplication of bitcode
+between languages."""
+
+import logging
+import os
+import csv
+import sys
+
+import plotly.express
+import plotly.io
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'hash_file', None,
+    'A CSV file containing a header and a list of function hashes.')
+flags.DEFINE_string('output_file', None, 'The path to the output image.')
+flags.DEFINE_enum('hash_key', 'function_hashes',
+                  ['function_hashes', 'module_hashes'],
+                  'The column name in the CSV containing the hashes.')
+flags.DEFINE_string(
+    'output_data_file', None,
+    'The output file to save data in or load data from if it already exists.')
+flags.DEFINE_bool('include_scale', True,
+                  'Whether or not to include the scale bar.')
+
+flags.mark_flag_as_required('hash_file')
+flags.mark_flag_as_required('output_file')
+
+
+def load_hash_histogram_from_file(file_path):
+  hash_histogram = {}
+  with open(file_path) as hash_file:
+    reader = csv.DictReader(hash_file)
+    for row in reader:
+      hash_value = row[FLAGS.hash_key]
+      if hash_value in hash_histogram:
+        hash_histogram[hash_value] += 1
+      else:
+        hash_histogram[hash_value] = 1
+  return hash_histogram
+
+
+def calculate_overlap(hash_histogram1, hash_histogram2):
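+  # Hashes present in both histograms contribute all of their occurrences as
+  # duplicates; hashes present in only one contribute a single unique entry.
+  # The result is duplicate occurrences over (unique hashes + duplicate
+  # occurrences).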
+  unique_functions = 0
+  duplicate_functions = 0
+  for function_hash in set(hash_histogram1) | set(hash_histogram2):
+    if function_hash in hash_histogram1 and function_hash in hash_histogram2:
+      duplicate_functions += hash_histogram1[function_hash] + hash_histogram2[
+          function_hash]
+    else:
+      unique_functions += 1
+  return duplicate_functions / (unique_functions + duplicate_functions)
+
+
+def calculate_duplication(hash_histogram):
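+  # Within a single histogram, hashes seen more than once contribute all of
+  # their occurrences as duplicates, while hashes seen exactly once count as
+  # unique.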
+  unique_functions = 0
+  duplicate_functions = 0
+  for function_hash in hash_histogram:
+    if hash_histogram[function_hash] > 1:
+      duplicate_functions += hash_histogram[function_hash]
+    else:
+      unique_functions += 1
+  return duplicate_functions / (unique_functions + duplicate_functions)
+
+
+def load_and_compute():
+  histograms = {}
+  for hash_file_path in FLAGS.hash_file:
+    logging.info(f'Loading data from {hash_file_path}')
+    language_name = os.path.basename(hash_file_path)[:-4]
+    histograms[language_name] = load_hash_histogram_from_file(hash_file_path)
+
+  logging.info('Finished loading data, generating matrix.')
+
+  duplication_matrix = []
+  for language_name_x in histograms:
+    duplication_matrix_row = []
+    for language_name_y in histograms:
+      if language_name_x == language_name_y:
+        duplication_matrix_row.append(
+            calculate_duplication(histograms[language_name_x]))
+      else:
+        duplication_matrix_row.append(
+            calculate_overlap(histograms[language_name_x],
+                              histograms[language_name_y]))
+    duplication_matrix.append(duplication_matrix_row)
+
+  languages = list(histograms.keys())
+
+  return (languages, duplication_matrix)
+
+
+def write_to_csv(languages, duplication_matrix):
+  with open(FLAGS.output_data_file, 'w') as data_file:
+    data_file_writer = csv.writer(data_file)
+    data_file_writer.writerow(languages)
+
+    for duplication_row in duplication_matrix:
+      data_file_writer.writerow(duplication_row)
+
+
+def read_from_csv():
+  with open(FLAGS.output_data_file) as data_file:
+    data_file_reader = csv.reader(data_file)
+
+    languages = next(data_file_reader)
+
+    duplication_matrix = []
+
+    for duplication_row in data_file_reader:
+      duplication_matrix.append([float(value) for value in duplication_row])
+
+    return (languages, duplication_matrix)
+
+
+def main(_):
+  if FLAGS.output_data_file and os.path.exists(FLAGS.output_data_file):
+    logging.info('Loading data from CSV file.')
+    languages, duplication_matrix = read_from_csv()
+  else:
+    logging.info('Loading and computing data from hash files.')
+    languages, duplication_matrix = load_and_compute()
+
+    if FLAGS.output_data_file:
+      logging.info('Saving duplication matrix to CSV file.')
+      write_to_csv(languages, duplication_matrix)
+
+  logging.info('Finished generating data, generating figure.')
+
+  figure = plotly.express.imshow(
+      duplication_matrix, text_auto=True, x=languages, y=languages)
+
+  figure.update_coloraxes(showscale=FLAGS.include_scale)
+
+  plotly.io.kaleido.scope.mathjax = None
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  csv.field_size_limit(sys.maxsize)
+
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/function_call_histogram.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/function_call_histogram.py
new file mode 100644
index 000000000000000..ba2f4dcafc77adb
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/function_call_histogram.py
@@ -0,0 +1,99 @@
+"""Tool for generating a histogram of external functions that get called."""
+
+import logging
+import os
+import csv
+
+import pandas
+import plotly.express
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'call_data_path', None,
+    'A path to a file containing a list of function calls.')
+flags.DEFINE_multi_string(
+    'defined_functions_path', None,
+    'A path to a file containing a list of defined functions.')
+flags.DEFINE_string('output_file', None, 'The path to the output image.')
+
+flags.mark_flag_as_required('call_data_path')
+flags.mark_flag_as_required('defined_functions_path')
+flags.mark_flag_as_required('output_file')
+
+
+def get_definitions_per_project(file_path):
+  project_functions = {}
+  with open(file_path) as definitions_file:
+    definition_reader = csv.DictReader(definitions_file)
+    for definition in definition_reader:
+      project_path = definition['name'].split(':')[0]
+      if project_path in project_functions:
+        project_functions[project_path].add(definition['defined_function'])
+      else:
+        project_functions[project_path] = set([definition['defined_function']])
+  return project_functions
+
+
+def load_external_calls(file_path, project_functions):
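+  # A call is treated as external when the callee is not among the functions
+  # defined in the caller's own project.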
+  external_calls = []
+  with open(file_path) as calls_file:
+    call_reader = csv.DictReader(calls_file)
+    for function_call in call_reader:
+      project_path = function_call['name'].split(':')[0]
+      called_function = function_call['call_names']
+      if called_function in project_functions[project_path]:
+        continue
+      external_calls.append(called_function)
+  return external_calls
+
+
+def generate_calls_histogram(external_calls):
+  call_histogram = {}
+  for external_call in external_calls:
+    if external_call in call_histogram:
+      call_histogram[external_call] += 1
+    else:
+      call_histogram[external_call] = 1
+  return call_histogram
+
+
+def main(_):
+  project_functions = {}
+
+  for defined_functions_path in FLAGS.defined_functions_path:
+    project_functions.update(
+        get_definitions_per_project(defined_functions_path))
+
+  external_calls = []
+
+  for call_data_path in FLAGS.call_data_path:
+    external_calls.extend(
+        load_external_calls(call_data_path, project_functions))
+
+  external_call_histogram = generate_calls_histogram(external_calls)
+
+  external_call_names = []
+  external_call_frequencies = []
+
+  for external_call in external_call_histogram:
+    external_call_names.append(external_call)
+    external_call_frequencies.append(external_call_histogram[external_call])
+
+  data_frame = pandas.DataFrame({
+      'call_name': external_call_names,
+      'count': external_call_frequencies
+  })
+
+  data_frame.sort_values(by=['count'], inplace=True, ascending=False)
+
+  figure = plotly.express.bar(data_frame.head(20), x='call_name', y='count')
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_histograms.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_histograms.py
new file mode 100644
index 000000000000000..fd95401280439c4
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_histograms.py
@@ -0,0 +1,121 @@
+"""A tool for generating histograms from a CSV file."""
+
+import logging
+import os
+
+import pandas
+
+import plotly.express
+import plotly.subplots
+import plotly.graph_objects
+import plotly.io
+
+from absl import app
+from absl import flags
+
+FLAGS = flags.FLAGS
+
+DEFAULT_SUBPLOT_SECTIONS = [
+    'TotalInstructionCount', 'BasicBlockCount', 'TopLevelLoopCount',
+    'DirectCallCount', 'LoadInstCount', 'StoreInstCount',
+    'IntegerInstructionCount', 'FloatingPointInstructionCount'
+]
+
+flags.DEFINE_multi_string(
+    'data_path', None,
+    'A path to a data file. Can be specified more than once.')
+flags.DEFINE_string('output_path', None,
+                    'The path to a folder to write the histograms to.')
+flags.DEFINE_integer('num_bins', 12,
+                     'The number of bins to use for the histograms.')
+flags.DEFINE_multi_string(
+    'sub_plot_sections', DEFAULT_SUBPLOT_SECTIONS,
+    'The column names to include in a subplot diagram. There must be eight '
+    'sections specified. If this flag is set, only one plot will be generated.')
+
+flags.mark_flag_as_required('data_path')
+flags.mark_flag_as_required('output_path')
+
+FANCY_PROPERTY_NAMES = {
+    'BasicBlockCount': 'Basic Blocks',
+    'TotalInstructionCount': 'Instructions',
+    'TopLevelLoopCount': 'Top-level Loops',
+    'LoadInstCount': 'Load Instructions',
+    'StoreInstCount': 'Store Instructions',
+    'DirectCallCount': 'Direct Calls',
+    'IntegerInstructionCount': 'Integer Instructions',
+    'FloatingPointInstructionCount': 'Floating Point Instructions'
+}
+
+
+def main(_):
+  data_frames = []
+  languages = []
+
+  for data_path in FLAGS.data_path:
+    logging.info(f'Loading data from {data_path}')
+    data_frame = pandas.read_csv(data_path)
+    data_frame.drop(['name'], axis=1, inplace=True)
+    language_name = os.path.basename(data_path)[:-4]
+    languages.append(language_name)
+    data_frame.insert(0, 'language', [language_name] * len(data_frame))
+    data_frames.append(data_frame)
+
+  data_frame = pandas.concat(data_frames)
+
+  logging.info('Finished loading data, generating histograms.')
+
+  if FLAGS.sub_plot_sections is None:
+    for column in data_frame:
+      figure = plotly.express.histogram(
+          data_frame,
+          x=column,
+          color='language',
+          nbins=FLAGS.num_bins,
+          log_y=True,
+          barmode='overlay')
+      figure.write_image(os.path.join(FLAGS.output_path, f'{column}.png'))
+      logging.info(f'Finished generating figure for {column}')
+    return
+
+  subplot_titles = [
+      FANCY_PROPERTY_NAMES[property_key]
+      for property_key in FLAGS.sub_plot_sections
+  ]
+
+  subplot_figure = plotly.subplots.make_subplots(
+      rows=2, cols=4, subplot_titles=subplot_titles)
+
+  for index, sub_plot_section in enumerate(FLAGS.sub_plot_sections):
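+    # Map the flat subplot index onto the 2x4 grid; plotly subplot rows and
+    # columns are 1-indexed.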
+    column = (index % 4) + 1
+    row = int(index / 4 + 1)
+
+    for language_index, language in enumerate(languages):
+      data_frame_subset = data_frame[data_frame['language'] == language]
+      to_show_legend = index == 0
+      subplot_figure.add_trace(
+          plotly.graph_objects.Histogram(
+              x=data_frame_subset[sub_plot_section].to_numpy(),
+              nbinsx=FLAGS.num_bins,
+              name=language,
+              marker_color=plotly.colors.qualitative.Plotly[language_index],
+              showlegend=to_show_legend),
+          col=column,
+          row=row)
+      subplot_figure.update_yaxes(
+          type="log", col=column, row=row, exponentformat='power')
+      logging.info(
+          f'Finished generating figure for {sub_plot_section} in {language}')
+
+  subplot_figure.update_layout(
+      width=2200, height=1000, barmode='group', font=dict(size=30))
+  subplot_figure.update_annotations(font_size=40)
+
+  logging.info('Writing image to file')
+
+  plotly.io.kaleido.scope.mathjax = None
+
+  subplot_figure.write_image(
+      os.path.join(FLAGS.output_path, 'subplot_figure.pdf'))
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_pass_frequency_chart.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_pass_frequency_chart.py
new file mode 100644
index 000000000000000..db19da1208b9453
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/generate_pass_frequency_chart.py
@@ -0,0 +1,104 @@
+"""A tool for generating bar charts describing how often passes run using a
+bar chart.
+"""
+
+import logging
+import os
+
+import pandas
+import plotly.graph_objects as go
+
+from absl import app
+from absl import flags
+
+from llvm_ir_dataset_utils.util import pass_list_constants
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'data_path', None,
+    'A path to a data file. Can be specified more than once.')
+flags.DEFINE_string('output_file', None,
+                    'The path to place the output image at.')
+flags.DEFINE_bool('combine_passes', False,
+                  'Whether or not to combine passes that run multiple times.')
+
+flags.mark_flag_as_required('data_path')
+flags.mark_flag_as_required('output_file')
+
+
+def main(_):
+  bar_charts = []
+
+  full_data_frame = pandas.DataFrame.from_dict({
+      'language': [],
+      'label': [],
+      'percentage': []
+  })
+
+  language_names = []
+
+  for language_data_path in FLAGS.data_path:
+    language_name = os.path.splitext(os.path.basename(language_data_path))[0]
+    language_names.append(language_name)
+    logging.info(f'Loading data from {language_data_path}.')
+    data_frame = pandas.read_csv(language_data_path)
+    data_frame.drop(['name'], axis=1, inplace=True)
+
+    # Only grab the passes actually present in the data, since a sample might
+    # be missing some of the loop passes. Selecting columns in this order also
+    # puts everything into the canonical O3 pass order.
+    passes_to_grab = []
+    for pass_name in pass_list_constants.OPT_DEFAULT_O3_PASS_LIST:
+      if pass_name in data_frame.columns:
+        passes_to_grab.append(pass_name)
+
+    data_frame = data_frame[passes_to_grab]
+
+    labels = []
+    percentages = []
+
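+    # Each pass frequency is the column sum divided by the number of
+    # functions, i.e. the column mean.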
+    for column in data_frame.keys():
+      percentage = data_frame[column].sum() / data_frame.shape[0]
+      if percentage != 0:
+        pass_name = column.split('Pass')[0]
+        if not FLAGS.combine_passes:
+          pass_name += column[-1]
+        labels.append(pass_name)
+        percentages.append(percentage)
+
+    partial_data_frame = pandas.DataFrame.from_dict({
+        'language': [language_name for _ in range(0, len(labels))],
+        'label': labels,
+        'percentage': percentages
+    })
+
+    full_data_frame = pandas.concat([full_data_frame, partial_data_frame])
+
+    logging.info(
+        f'Finished generating data for plot with {len(labels)} labels for {language_name}'
+    )
+
+  logging.info('Finished loading data, generating figures.')
+
+  full_data_frame.sort_values(by=['percentage'], ascending=False, inplace=True)
+
+  for language_name in language_names:
+    data_frame = full_data_frame[full_data_frame['language'] == language_name]
+    bar_charts.append(
+        go.Bar(
+            name=language_name,
+            x=data_frame['percentage'],
+            y=data_frame['label'],
+            orientation='h'))
+
+  figure = go.Figure(data=bar_charts)
+
+  figure.update_layout(
+      barmode='group', height=1500, width=1000, font=dict(size=20))
+  figure.update_xaxes(type="log", exponentformat='power')
+  figure.update_yaxes(autorange="reversed")
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/size_treemap.py b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/size_treemap.py
new file mode 100644
index 000000000000000..9f04b9c290035b9
--- /dev/null
+++ b/llvm-ir-dataset-utils/llvm_ir_dataset_utils/visualization_tools/size_treemap.py
@@ -0,0 +1,99 @@
+"""Visualization tool for generating a treemap of size information."""
+
+import os
+
+import pandas
+
+import plotly.express
+
+from absl import flags
+from absl import app
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_multi_string(
+    'size_file', None,
+    'A size file to load data from. Can be specified more than once.')
+flags.DEFINE_string('output_file', None,
+                    'The output file to place the image in.')
+flags.DEFINE_integer(
+    'size_threshold', 100 * 1000**2,
+    'The size threshold before putting a project in the other category (in bytes).'
+)
+
+flags.mark_flag_as_required('size_file')
+flags.mark_flag_as_required('output_file')
+
+
+def load_sizes_file(size_file_path):
+  other_size = 0
+  total_size = 0
+  with open(size_file_path) as size_file:
+    name_size_pairs = []
+    for line in size_file:
+      name_size_pair = line.rstrip().split(',')
+      name = name_size_pair[0]
+      size = int(name_size_pair[1])
+      total_size += size
+      if size < FLAGS.size_threshold:
+        other_size += size
+        continue
+      name_size_pairs.append((name, size))
+  # Get the basename of the file without the extension
+  language_name_base = os.path.basename(size_file_path)[:-4]
+  language_name = f'{language_name_base} ({str(round(total_size / 10**9, 0))[:-2]} GB)'
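+  # The treemap hierarchy is ComPile -> language -> project, with projects
+  # below the size threshold rolled up into a single 'Small <language>
+  # projects' node.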
+  names = [language_name]
+  languages = ['ComPile']
+  values = [0]
+  text = ['']
+  for name, size in name_size_pairs:
+    size_mb_string = str(round(size / 10**6, 0))[:-2]
+    names.append(name + size_mb_string)
+    languages.append(language_name)
+    values.append(size)
+    text.append(f'{size_mb_string} MB')
+  other_size_gb = str(round(other_size / 10**9, 2))
+  names.append(f'Small {language_name_base} projects')
+  text.append(f'Small {language_name_base} projects ({other_size_gb} GB).')
+  languages.append(language_name)
+  values.append(other_size)
+  return (names, languages, values, text)
+
+
+def main(_):
+  names = ['ComPile']
+  languages = ['']
+  sizes = [0]
+  text = ['']
+
+  for size_file in FLAGS.size_file:
+    new_names, new_languages, new_sizes, new_text = load_sizes_file(size_file)
+    names.extend(new_names)
+    languages.extend(new_languages)
+    sizes.extend(new_sizes)
+    text.extend(new_text)
+
+  data_frame = pandas.DataFrame(
+      list(zip(names, languages, sizes)),
+      columns=['names', 'languages', 'sizes'])
+
+  figure = plotly.express.treemap(
+      data_frame=data_frame,
+      names='names',
+      parents='languages',
+      values='sizes',
+      color='sizes',
+      color_continuous_scale='Aggrnyl',
+      width=1100,
+      height=550)
+
+  figure.data[0].text = text
+  figure.data[0].textinfo = 'text'
+
+  figure.update_layout(margin=dict(l=20, r=20, t=20, b=20),)
+
+  figure.write_image(FLAGS.output_file)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm-ir-dataset-utils/pyproject.toml b/llvm-ir-dataset-utils/pyproject.toml
new file mode 100644
index 000000000000000..82d6a58849a3a88
--- /dev/null
+++ b/llvm-ir-dataset-utils/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "llvm-ir-dataset-utils"
+description = "Infrastructure to build an LLVM IR dataset."
+readme = "README.md"
+authors = [
+    {name="Aiden Grossman", email="aidengrossmanpso at gmail.com"},
+    {name="Ludger Paehler", email="ludger.paehler at tum.de"}
+]
+version = "0.1"
+license = {text="Apache-2.0"}
+classifiers = [
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Development Status :: 2 - Pre-Alpha Copy",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache-2.0",
+]
+requires-python = ">=3.8"
+dependencies = [
+  "absl-py>=1.4.0",
+  "ml-compiler-opt>=0.0.1.dev202308100007",
+  "ray>=2.5.1",
+  "toml>=0.10.2",
+  "pandas>=2.0.3",
+  "plotly>=5.16.1",
+  "kaleido>=0.2.1",
+  "umap-learn>=0.5.3",
+]
+
+[project.optional-dependencies]
+dev = [
+    "yapf>=0.33.0",
+    "pytest>=7.4.0"
+]
+
+[tool.setuptools.packages.find]
+include = ["llvm_ir_dataset_utils"]



More information about the llvm-commits mailing list