[libc-commits] [libc] 00c943a - [libc] automemcpy

Thu Oct 28 04:10:39 PDT 2021

Author: Guillaume Chatelet
Date: 2021-10-28T11:10:15Z
New Revision: 00c943a5488577c1237df81fb5d9b3312f429738

URL: https://github.com/llvm/llvm-project/commit/00c943a5488577c1237df81fb5d9b3312f429738
DIFF: https://github.com/llvm/llvm-project/commit/00c943a5488577c1237df81fb5d9b3312f429738.diff

LOG: [libc] automemcpy

Added: 
    libc/benchmarks/automemcpy/CMakeLists.txt
    libc/benchmarks/automemcpy/README.md
    libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
    libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
    libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
    libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
    libc/benchmarks/automemcpy/lib/CMakeLists.txt
    libc/benchmarks/automemcpy/lib/CodeGen.cpp
    libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
    libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
    libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
    libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
    libc/benchmarks/automemcpy/unittests/CMakeLists.txt
    libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
    libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp

Modified: 
    libc/benchmarks/CMakeLists.txt
    libc/src/string/memory_utils/elements.h

Removed: 
    


################################################################################
diff  --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 18e5fc0c255c9..01aab0585bbf7 100644

--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -112,9 +112,14 @@ add_library(libc-memory-benchmark
     EXCLUDE_FROM_ALL
     LibcMemoryBenchmark.cpp
     LibcMemoryBenchmark.h
+    LibcFunctionPrototypes.h
     MemorySizeDistributions.cpp
     MemorySizeDistributions.h
 )
+target_include_directories(libc-memory-benchmark
+    PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
 target_link_libraries(libc-memory-benchmark
     PUBLIC
     libc-benchmark
@@ -196,3 +201,5 @@ target_link_libraries(libc.benchmarks.memory_functions.opt_host
   libc.src.string.bzero_opt_host
   benchmark_main
 )
+
+add_subdirectory(automemcpy)

diff  --git a/libc/benchmarks/automemcpy/CMakeLists.txt b/libc/benchmarks/automemcpy/CMakeLists.txt
new file mode 100644
index 0000000000000..ef9b4218c8d61
--- /dev/null
+++ b/libc/benchmarks/automemcpy/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(NOT LIBC_BUILD_AUTOMEMCPY)
+  return ()
+endif()
+
+if(NOT LLVM_WITH_Z3)
+  MESSAGE(FATAL_ERROR "Building llvm-libc automemcpy requires Z3")
+endif()
+
+set(LIBC_AUTOMEMCPY_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+add_subdirectory(lib)
+add_subdirectory(unittests)

diff  --git a/libc/benchmarks/automemcpy/README.md b/libc/benchmarks/automemcpy/README.md
new file mode 100644
index 0000000000000..88d0b7ece9b9f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/README.md
@@ -0,0 +1,111 @@
+This folder contains an implementation of [automemcpy: A framework for automatic generation of fundamental memory operations](https://research.google/pubs/pub50338/).
+
+It uses the [Z3 theorem prover](https://github.com/Z3Prover/z3) to enumerate a subset of valid memory function implementations. These implementations are then materialized as C++ code and can be [benchmarked](../) against various [size distributions](../distributions). This process helps the design of efficient implementations for a particular environment (size distribution, processor or custom compilation options).
+
+This is not enabled by default, as it is mostly useful when working on tuning the library implementation. To build it, use `LIBC_BUILD_AUTOMEMCPY=ON` (see below).
+
+## Prerequisites
+
+You may need to install `Z3` from source if it's not available on your system.
+Here we show instructions to install it into `<Z3_INSTALL_DIR>`.
+You may need to `sudo` to `make install`.
+
+```shell
+mkdir -p ~/git
+cd ~/git
+git clone https://github.com/Z3Prover/z3.git && cd z3
+python scripts/mk_make.py --prefix=<Z3_INSTALL_DIR>
+cd build
+make -j
+make install 
+```
+
+## Configuration
+
+```shell
+mkdir -p <BUILD_DIR>
+cd <LLVM_PROJECT_DIR>/llvm
+cmake -DCMAKE_C_COMPILER=/usr/bin/clang \
+ -DCMAKE_CXX_COMPILER=/usr/bin/clang++ \
+ -DLLVM_ENABLE_PROJECTS="libc" \
+ -DLLVM_ENABLE_Z3_SOLVER=ON \
+ -DLLVM_Z3_INSTALL_DIR=<Z3_INSTALL_DIR> \
+ -DLIBC_BUILD_AUTOMEMCPY=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -B<BUILD_DIR>
+```
+
+## Targets and compilation
+
+There are three main CMake targets
+ 1. `automemcpy_implementations`
+    - runs `Z3` and materializes valid memory functions as C++ code; a message will display its on-disk location.
+    - the source code is then compiled using the native host optimizations (i.e. `-march=native` or `-mcpu=native` depending on the architecture).
+ 2. `automemcpy`
+    - the binary that benchmarks the autogenerated implementations.
+ 3. `automemcpy_result_analyzer`
+    - the binary that analyses the benchmark results.
+
+You only need to compile the binaries, as they both pull in the autogenerated code as a dependency.
+
+```shell
+make -C <BUILD_DIR> -j automemcpy automemcpy_result_analyzer
+```
+
+## Running the benchmarks
+
+Make sure to save the results of the benchmark as a json file.
+
+```shell
+<BUILD_DIR>/bin/automemcpy --benchmark_out_format=json --benchmark_out=<RESULTS_DIR>/results.json
+```
+
+### Additional useful options
+
+
+ - `--benchmark_min_time=.2`
+
+     By default, each function is benchmarked for at least one second, here we lower it to 200ms.
+
+ - `--benchmark_filter="BM_Memset|BM_Bzero"`
+ 
+     By default, all functions are benchmarked, here we restrict them to `memset` and `bzero`.
+
+Other options might be useful, use `--help` for more information.
+
+## Analyzing the benchmarks
+
+Analysis is performed by running `automemcpy_result_analyzer` on one or more json result files.
+
+```shell
+<BUILD_DIR>/bin/automemcpy_result_analyzer <RESULTS_DIR>/results.json
+```
+
+What it does:
+  1. Gathers all throughput values for each function / distribution pair and picks the median one.\
+  This allows picking a representative value over many runs of the benchmark. Please make sure all the runs happen under similar circumstances.
+
+  2. For each distribution, looks at the span of throughputs for functions of the same type (e.g. for distribution `A`, memcpy throughput spans from 2GiB/s to 5GiB/s).
+
+  3. For each distribution, gives a normalized score to each function (e.g. for distribution `A`, function `M` scores 0.65).\
+  This score is then turned into a grade (`EXCELLENT`, `VERY_GOOD`, `GOOD`, `PASSABLE`, `INADEQUATE`, `MEDIOCRE`, `BAD`), so that each distribution categorizes how functions perform according to it.
+
+  4. A [Majority Judgement](https://en.wikipedia.org/wiki/Majority_judgment) process is then used to categorize each function. This enables finer analysis of how distributions agree on which function is better. In the following example, `Function_1` and `Function_2` are rated `EXCELLENT` but looking at the grade's distribution might help decide which is best.
+
+|            | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | INADEQUATE | MEDIOCRE | BAD |
+|------------|:---------:|:---------:|:----:|:--------:|:----------:|:--------:|:---:|
+| Function_1 |     7     |     1     |   2  |          |            |          |     |
+| Function_2 |     6     |     4     |      |          |            |          |     |
+
+The tool outputs the histogram of grades for each function. In case of tie, other dimensions might help decide (e.g. code size, performance on other microarchitectures).
+
+```
+EXCELLENT  |█▁▂    |  Function_0
+EXCELLENT  |█▅     |  Function_1
+VERY_GOOD  |▂█▁ ▁  |  Function_2
+GOOD       | ▁█▄   |  Function_3
+PASSABLE   | ▂▆▄█  |  Function_4
+INADEQUATE |  ▃▃█▁ |  Function_5
+MEDIOCRE   |    █▆▁|  Function_6
+BAD        |    ▁▁█|  Function_7
+```

diff  --git a/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
new file mode 100644
index 0000000000000..389e8249f9399
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
@@ -0,0 +1,26 @@
+//===-- C++ code generation from NamedFunctionDescriptors -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H
+#define LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/Support/raw_ostream.h>
+#include <vector>
+
+namespace llvm {
+namespace automemcpy {
+
+// This function serializes the array of FunctionDescriptors as a C++ file.
+void Serialize(raw_ostream &Stream, ArrayRef<FunctionDescriptor> FD);
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif // LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H

diff  --git a/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
new file mode 100644
index 0000000000000..444d856a7260d
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
@@ -0,0 +1,159 @@
+//===-- Pod structs to describe a memory function----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H
+#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H
+
+#include <climits>
+#include <cstddef>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/Hashing.h>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/StringRef.h>
+#include <tuple>
+
+namespace llvm {
+namespace automemcpy {
+
+// Boilerplate code to be able to sort and hash types.
+#define COMPARABLE_AND_HASHABLE(T, ...)                                        \
+  inline auto asTuple() const { return std::tie(__VA_ARGS__); }                \
+  bool operator==(const T &O) const { return asTuple() == O.asTuple(); }       \
+  bool operator<(const T &O) const { return asTuple() < O.asTuple(); }         \
+  struct Hasher {                                                              \
+    std::size_t operator()(const T &K) const {                                 \
+      return llvm::hash_value(K.asTuple());                                    \
+    }                                                                          \
+  };
+
+// Represents the maximum value for the size parameter of a memory function.
+// This is an `int` so we can use it as an expression in Z3.
+// It also allows for a more readable and compact representation when storing
+// the SizeSpan in the autogenerated C++ file.
+static constexpr int kMaxSize = INT_MAX;
+
+// This mimics the `Arg` type in libc/src/string/memory_utils/elements.h without
+// having to depend on it.
+enum class AlignArg { _1, _2, ARRAY_SIZE };
+
+// Describes a range of sizes.
+// We use the begin/end representation instead of first/last to allow for empty
+// range (i.e. Begin == End)
+struct SizeSpan {
+  size_t Begin = 0;
+  size_t End = 0;
+
+  COMPARABLE_AND_HASHABLE(SizeSpan, Begin, End)
+};
+
+// Describes a contiguous region.
+// In such a region all sizes are handled individually.
+// e.g. with Span = {0, 2};
+// if(size == 0) return Handle<0>();
+// if(size == 1) return Handle<1>();
+struct Contiguous {
+  SizeSpan Span;
+
+  COMPARABLE_AND_HASHABLE(Contiguous, Span)
+};
+
+// This struct represents a range of sizes over which to use an overlapping
+// strategy. An overlapping strategy of size N handles all sizes from N to 2xN.
+// The span may represent several contiguous overlaps.
+// e.g. with Span = {16, 128};
+// if(size >= 16 and size < 32) return Handle<Overlap<16>>();
+// if(size >= 32 and size < 64) return Handle<Overlap<32>>();
+// if(size >= 64 and size < 128) return Handle<Overlap<64>>();
+struct Overlap {
+  SizeSpan Span;
+
+  COMPARABLE_AND_HASHABLE(Overlap, Span)
+};
+
+// Describes a region using a loop handling BlockSize bytes at a time. The
+// remaining bytes of the loop are handled with an overlapping operation.
+struct Loop {
+  SizeSpan Span;
+  size_t BlockSize = 0;
+
+  COMPARABLE_AND_HASHABLE(Loop, Span, BlockSize)
+};
+
+// Same as `Loop` but starts by aligning a buffer on `Alignment` bytes.
+// A first operation handling `Alignment` bytes is performed followed by a
+// sequence of Loop.BlockSize bytes operation. The Loop starts processing from
+// the next aligned byte in the chosen buffer. The remaining bytes of the loop
+// are handled with an overlapping operation.
+struct AlignedLoop {
+  Loop Loop;
+  size_t Alignment = 0;            // Size of the alignment.
+  AlignArg AlignTo = AlignArg::_1; // Which buffer to align.
+
+  COMPARABLE_AND_HASHABLE(AlignedLoop, Loop, Alignment, AlignTo)
+};
+
+// Some processors offer special instructions to handle the memory function
+// completely, we refer to such instructions as accelerators.
+struct Accelerator {
+  SizeSpan Span;
+
+  COMPARABLE_AND_HASHABLE(Accelerator, Span)
+};
+
+// The memory functions are assembled out of primitives that can be implemented
+// with regular scalar operations (SCALAR), with the help of vector or bitcount
+// instructions (NATIVE) or by deferring it to the compiler (BUILTIN).
+enum class ElementTypeClass {
+  SCALAR,
+  NATIVE,
+  BUILTIN,
+};
+
+// A simple enum to categorize which function is being implemented.
+enum class FunctionType {
+  MEMCPY,
+  MEMCMP,
+  BCMP,
+  MEMSET,
+  BZERO,
+};
+
+// This struct describes the skeleton of the implementation, it does not go into
+// every detail but is enough to uniquely identify the implementation.
+struct FunctionDescriptor {
+  FunctionType Type;
+  Optional<Contiguous> Contiguous;
+  Optional<Overlap> Overlap;
+  Optional<Loop> Loop;
+  Optional<AlignedLoop> AlignedLoop;
+  Optional<Accelerator> Accelerator;
+  ElementTypeClass ElementClass;
+
+  COMPARABLE_AND_HASHABLE(FunctionDescriptor, Type, Contiguous, Overlap, Loop,
+                          AlignedLoop, Accelerator, ElementClass)
+
+  inline size_t id() const { return llvm::hash_value(asTuple()); }
+};
+
+// Same as above but with the function name.
+struct NamedFunctionDescriptor {
+  StringRef Name;
+  FunctionDescriptor Desc;
+};
+
+template <typename T> llvm::hash_code hash_value(const ArrayRef<T> &V) {
+  return llvm::hash_combine_range(V.begin(), V.end());
+}
+template <typename T> llvm::hash_code hash_value(const T &O) {
+  return llvm::hash_value(O.asTuple());
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H */

diff  --git a/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
new file mode 100644
index 0000000000000..48e8815801c55
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
@@ -0,0 +1,62 @@
+//===-- Generate random but valid function descriptors  ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H
+#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include <cstddef>
+#include <cstdint>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/StringRef.h>
+#include <vector>
+#include <z3++.h>
+
+namespace llvm {
+namespace automemcpy {
+
+// Holds the state for the constraint solver.
+// It implements a single method that returns the next valid description.
+struct RandomFunctionGenerator {
+  RandomFunctionGenerator();
+
+  // Get the next valid FunctionDescriptor or llvm::None.
+  Optional<FunctionDescriptor> next();
+
+private:
+  // Returns an expression where `Variable` is forced to be one of the `Values`.
+  z3::expr inSetConstraint(z3::expr &Variable, ArrayRef<int> Values) const;
+  // Add constraints to `Begin` and `End` so that they are:
+  // - between 0 and kMaxSize (inclusive)
+  // - ordered (Begin <= End)
+  // - amongst a set of predefined values.
+  void addBoundsAndAnchors(z3::expr &Begin, z3::expr &End);
+  // Add constraints to make sure that the loop block size is amongst a set of
+  // predefined values. Also makes sure that the loop is iterated
+  // at least `LoopMinIter` times.
+  void addLoopConstraints(const z3::expr &LoopBegin, const z3::expr &LoopEnd,
+                          z3::expr &LoopBlockSize, int LoopMinIter);
+
+  z3::context Context;
+  z3::solver Solver;
+
+  z3::expr Type;
+  z3::expr ContiguousBegin, ContiguousEnd;
+  z3::expr OverlapBegin, OverlapEnd;
+  z3::expr LoopBegin, LoopEnd, LoopBlockSize;
+  z3::expr AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize,
+      AlignedAlignment, AlignedArg;
+  z3::expr AcceleratorBegin, AcceleratorEnd;
+  z3::expr ElementClass;
+};
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H */

diff  --git a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
new file mode 100644
index 0000000000000..845c3e1e1180f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
@@ -0,0 +1,99 @@
+//===-- Analyze benchmark JSON files ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H
+#define LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include <array>
+#include <vector>
+
+namespace llvm {
+namespace automemcpy {
+
+// A Grade as in the Majority Judgment voting system.
+struct Grade {
+  enum GradeEnum {
+    EXCELLENT,
+    VERY_GOOD,
+    GOOD,
+    PASSABLE,
+    INADEQUATE,
+    MEDIOCRE,
+    BAD,
+    ARRAY_SIZE,
+  };
+
+  // Returns a human readable string of the enum.
+  static StringRef getString(const GradeEnum &GE);
+
+  // Turns 'Score' into a GradeEnum.
+  static GradeEnum judge(double Score);
+};
+
+// A 'GradeEnum' indexed array with counts for each grade.
+using GradeHistogram = std::array<size_t, Grade::ARRAY_SIZE>;
+
+// Identifies a Function by its name and type. Used as a key in a map.
+struct FunctionId {
+  StringRef Name;
+  FunctionType Type;
+  COMPARABLE_AND_HASHABLE(FunctionId, Type, Name)
+};
+
+struct PerDistributionData {
+  double MedianBytesPerSecond; // Median of samples for this distribution.
+  double Score;                // Normalized score for this distribution.
+  Grade::GradeEnum Grade;      // Grade for this distribution.
+};
+
+struct FunctionData {
+  FunctionId Id;
+  StringMap<PerDistributionData> PerDistributionData;
+  GradeHistogram GradeHisto = {};           // GradeEnum indexed array
+  Grade::GradeEnum FinalGrade = Grade::BAD; // Overall grade for this function
+};
+
+// Identifies a Distribution by its name. Used as a key in a map.
+struct DistributionId {
+  StringRef Name;
+  COMPARABLE_AND_HASHABLE(DistributionId, Name)
+};
+
+// Identifies a Sample by its distribution and function. Used as a key in a map.
+struct SampleId {
+  FunctionId Function;
+  DistributionId Distribution;
+  COMPARABLE_AND_HASHABLE(SampleId, Function.Type, Function.Name,
+                          Distribution.Name)
+};
+
+// A SampleId with an associated measured throughput.
+struct Sample {
+  SampleId Id;
+  double BytesPerSecond = 0;
+};
+
+// This function collects Samples that belong to the same distribution and
+// function and retains the median one. It then stores each of them into a
+// 'FunctionData' and returns them as a vector.
+std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples);
+
+// Normalize the function's throughput per distribution.
+void fillScores(MutableArrayRef<FunctionData> Functions);
+
+// Convert scores into Grades, store a histogram of Grades for each function
+// and cast a median grade for the function.
+void castVotes(MutableArrayRef<FunctionData> Functions);
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif // LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H

diff  --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..073a92ae68ad0
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_library(automemcpy_codegen CodeGen.cpp)
+target_link_libraries(automemcpy_codegen PUBLIC LLVMSupport)
+target_compile_options(automemcpy_codegen PUBLIC -fno-rtti)
+target_include_directories(automemcpy_codegen PUBLIC ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+
+add_executable(automemcpy_codegen_main CodeGenMain.cpp RandomFunctionGenerator.cpp)
+target_link_libraries(automemcpy_codegen_main PUBLIC automemcpy_codegen ${Z3_LIBRARIES})
+target_compile_options(automemcpy_codegen_main PUBLIC -fno-rtti)
+
+set(Implementations "${CMAKE_CURRENT_BINARY_DIR}/Implementations.cpp")
+add_custom_command(
+    OUTPUT ${Implementations}
+    COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/automemcpy_codegen_main" > "${Implementations}"
+    COMMAND echo "automemcpy implementations generated in ${Implementations}"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEPENDS automemcpy_codegen_main
+)
+
+add_library(automemcpy_implementations "${Implementations}")
+target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark)
+target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+target_compile_options(automemcpy_implementations PUBLIC -fno-rtti PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin)
+
+add_executable(automemcpy EXCLUDE_FROM_ALL ${LIBC_SOURCE_DIR}/benchmarks/LibcMemoryGoogleBenchmarkMain.cpp)
+target_link_libraries(automemcpy PRIVATE libc-memory-benchmark benchmark_main automemcpy_implementations)
+
+add_library(automemcpy_result_analyzer_lib EXCLUDE_FROM_ALL ResultAnalyzer.cpp)
+target_link_libraries(automemcpy_result_analyzer_lib PUBLIC LLVMSupport)
+target_include_directories(automemcpy_result_analyzer_lib PUBLIC ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+
+add_executable(automemcpy_result_analyzer EXCLUDE_FROM_ALL ResultAnalyzerMain.cpp)
+target_link_libraries(automemcpy_result_analyzer PRIVATE automemcpy_result_analyzer_lib automemcpy_implementations)

diff  --git a/libc/benchmarks/automemcpy/lib/CodeGen.cpp b/libc/benchmarks/automemcpy/lib/CodeGen.cpp
new file mode 100644
index 0000000000000..28bd62044c549
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CodeGen.cpp
@@ -0,0 +1,646 @@
+//===-- C++ code generation from NamedFunctionDescriptors -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This code is responsible for generating the "Implementations.cpp" file.
+// The file is composed like this:
+//
+// 1. Includes
+// 2. Using statements to help readability.
+// 3. Source code for all the mem function implementations.
+// 4. The function to retrieve all the function descriptors with their name.
+//      llvm::ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors();
+// 5. The functions for the benchmarking infrastructure:
+//      llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations();
+//      llvm::ArrayRef<MemcmpOrBcmpConfiguration> getMemcmpConfigurations();
+//      llvm::ArrayRef<MemcmpOrBcmpConfiguration> getBcmpConfigurations();
+//      llvm::ArrayRef<MemsetConfiguration> getMemsetConfigurations();
+//      llvm::ArrayRef<BzeroConfiguration> getBzeroConfigurations();
+//
+//
+// Sections 3, 4 and 5 are handled by the following namespaces:
+// - codegen::functions
+// - codegen::descriptors
+// - codegen::configurations
+//
+// The programming style is functional. In each of these namespaces, the
+// original `NamedFunctionDescriptor` object is turned into a different type. We
+// make use of overloaded stream operators to format the resulting type into
+// either a function, a descriptor or a configuration. The entry point of each
+// namespace is the Serialize function.
+//
+// Note the code here is better understood by starting from the `Serialize`
+// function at the end of the file.
+
+#include "automemcpy/CodeGen.h"
+#include <cassert>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/StringSet.h>
+#include <llvm/Support/FormatVariadic.h>
+#include <llvm/Support/raw_ostream.h>
+#include <set>
+
+namespace llvm {
+namespace automemcpy {
+namespace codegen {
+
+// The indentation string.
+static constexpr StringRef kIndent = "  ";
+
+// The codegen namespace handles the serialization of a NamedFunctionDescriptor
+// into source code for the function, the descriptor and the configuration.
+
+namespace functions {
+
+// This namespace turns a NamedFunctionDescriptor into an actual implementation.
+// -----------------------------------------------------------------------------
+// e.g.
+// static void memcpy_0xB20D4702493C397E(char *__restrict dst,
+//                                       const char *__restrict src,
+//                                       size_t size) {
+//   using namespace __llvm_libc::x86;
+//   if(size == 0) return;
+//   if(size == 1) return Copy<_1>(dst, src);
+//   if(size < 4) return Copy<HeadTail<_2>>(dst, src, size);
+//   if(size < 8) return Copy<HeadTail<_4>>(dst, src, size);
+//   if(size < 16) return Copy<HeadTail<_8>>(dst, src, size);
+//   if(size < 32) return Copy<HeadTail<_16>>(dst, src, size);
+//   return Copy<Accelerator>(dst, src, size);
+// }
+
+// The `Serialize` method turns a `NamedFunctionDescriptor` into a
+// `FunctionImplementation` which holds all the information needed to produce
+// the C++ source code.
+
+// An Element with its size (e.g. `_16` in the example above).
+struct ElementType {
+  size_t Size;
+};
+// The case `if(size == 0)` is encoded as the Zero type.
+struct Zero {
+  StringRef DefaultReturnValue;
+};
+// An individual size `if(size == X)` is encoded as an Individual type.
+struct Individual {
+  size_t IfEq;
+  ElementType Element;
+};
+// An overlap strategy is encoded as an Overlap type.
+struct Overlap {
+  size_t IfLt;
+  ElementType Element;
+};
+// A loop strategy is encoded as a Loop type.
+struct Loop {
+  size_t IfLt;
+  ElementType Element;
+};
+// An aligned loop strategy is encoded as an AlignedLoop type.
+struct AlignedLoop {
+  size_t IfLt;
+  ElementType Element;
+  ElementType Alignment;
+  StringRef AlignTo;
+};
+// The accelerator strategy.
+struct Accelerator {
+  size_t IfLt;
+};
+// The Context stores data about the function type.
+struct Context {
+  StringRef FunctionReturnType; // e.g. void* or int
+  StringRef FunctionArgs;
+  StringRef ElementOp; // Copy, ThreeWayCompare, SplatSet, ...
+  StringRef FixedSizeArgs;
+  StringRef RuntimeSizeArgs;
+  StringRef AlignArg1;
+  StringRef AlignArg2;
+  StringRef DefaultReturnValue;
+};
+// A detailed representation of the function implementation mapped from the
+// NamedFunctionDescriptor.
+struct FunctionImplementation {
+  Context Ctx;
+  StringRef Name;
+  std::vector<Individual> Individuals;
+  std::vector<Overlap> Overlaps;
+  Optional<Loop> Loop;
+  Optional<AlignedLoop> AlignedLoop;
+  Optional<Accelerator> Accelerator;
+  ElementTypeClass ElementClass;
+};
+
+// Returns the Context for each FunctionType.
+static Context getCtx(FunctionType FT) {
+  switch (FT) {
+  case FunctionType::MEMCPY:
+    return {"void",
+            "(char *__restrict dst, const char *__restrict src, size_t size)",
+            "Copy",
+            "(dst, src)",
+            "(dst, src, size)",
+            "Arg::Dst",
+            "Arg::Src",
+            ""};
+  case FunctionType::MEMCMP:
+    return {"int",
+            "(const char * lhs, const char * rhs, size_t size)",
+            "ThreeWayCompare",
+            "(lhs, rhs)",
+            "(lhs, rhs, size)",
+            "Arg::Lhs",
+            "Arg::Rhs",
+            "0"};
+  case FunctionType::MEMSET:
+    return {"void",
+            "(char * dst, int value, size_t size)",
+            "SplatSet",
+            "(dst, value)",
+            "(dst, value, size)",
+            "Arg::Dst",
+            "Arg::Src",
+            ""};
+  case FunctionType::BZERO:
+    return {"void",           "(char * dst, size_t size)",
+            "SplatSet",       "(dst, 0)",
+            "(dst, 0, size)", "Arg::Dst",
+            "Arg::Src",       ""};
+  default:
+    report_fatal_error("Not yet implemented");
+  }
+}
+
+static StringRef getAligntoString(const Context &Ctx, const AlignArg &AlignTo) {
+  switch (AlignTo) {
+  case AlignArg::_1:
+    return Ctx.AlignArg1;
+  case AlignArg::_2:
+    return Ctx.AlignArg2;
+  case AlignArg::ARRAY_SIZE:
+    report_fatal_error("logic error");
+  }
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream, const ElementType &E) {
+  return Stream << '_' << E.Size;
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Individual &O) {
+  return Stream << O.Element;
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) {
+  return Stream << "HeadTail<" << O.Element << '>';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) {
+  return Stream << "Loop<" << O.Element << '>';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) {
+  return Stream << "Align<" << O.Alignment << ',' << O.AlignTo << ">::Then<"
+                << Loop{O.IfLt, O.Element} << ">";
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) {
+  return Stream << "Accelerator";
+}
+
+template <typename T> struct IfEq {
+  StringRef Op;
+  StringRef Args;
+  const T &Element;
+};
+
+template <typename T> struct IfLt {
+  StringRef Op;
+  StringRef Args;
+  const T &Element;
+};
+
+static raw_ostream &operator<<(raw_ostream &Stream, const Zero &O) {
+  Stream << kIndent << "if(size == 0) return";
+  if (!O.DefaultReturnValue.empty())
+    Stream << ' ' << O.DefaultReturnValue;
+  return Stream << ";\n";
+}
+
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream, const IfEq<T> &O) {
+  return Stream << kIndent << "if(size == " << O.Element.IfEq << ") return "
+                << O.Op << '<' << O.Element << '>' << O.Args << ";\n";
+}
+
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream, const IfLt<T> &O) {
+  Stream << kIndent;
+  if (O.Element.IfLt != kMaxSize)
+    Stream << "if(size < " << O.Element.IfLt << ") ";
+  return Stream << "return " << O.Op << '<' << O.Element << '>' << O.Args
+                << ";\n";
+}
+
+// Prints the name of the namespace that holds the element implementations for
+// `Class`. A value outside the enumeration is a logic error.
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const ElementTypeClass &Class) {
+  switch (Class) {
+  case ElementTypeClass::SCALAR:
+    return Stream << "scalar";
+  case ElementTypeClass::BUILTIN:
+    return Stream << "builtin";
+  case ElementTypeClass::NATIVE:
+    // FIXME: the framework should provide a `native` namespace that redirect to
+    // x86, arm or other architectures.
+    return Stream << "x86";
+  }
+  // Keeps control from flowing off the end of a non-void function when
+  // `Class` holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+
+// Emits the complete definition of one generated function: its signature, a
+// `using namespace` for the element class, one statement per strategy in the
+// order individuals, overlaps, loop, aligned loop, accelerator, and the
+// closing brace.
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const FunctionImplementation &FI) {
+  const auto &Context = FI.Ctx;
+  Stream << "static " << Context.FunctionReturnType << ' ' << FI.Name
+         << Context.FunctionArgs << " {\n";
+  Stream << kIndent << "using namespace __llvm_libc::" << FI.ElementClass
+         << ";\n";
+  // Exact sizes: size zero has a dedicated early return.
+  for (const auto &Item : FI.Individuals) {
+    if (Item.Element.Size != 0)
+      Stream << IfEq<Individual>{Context.ElementOp, Context.FixedSizeArgs,
+                                 Item};
+    else
+      Stream << Zero{Context.DefaultReturnValue};
+  }
+  // Size ranges handled by strategies taking a runtime size.
+  for (const auto &Item : FI.Overlaps)
+    Stream << IfLt<Overlap>{Context.ElementOp, Context.RuntimeSizeArgs, Item};
+  if (FI.Loop)
+    Stream << IfLt<Loop>{Context.ElementOp, Context.RuntimeSizeArgs, *FI.Loop};
+  if (FI.AlignedLoop)
+    Stream << IfLt<AlignedLoop>{Context.ElementOp, Context.RuntimeSizeArgs,
+                                *FI.AlignedLoop};
+  if (FI.Accelerator)
+    Stream << IfLt<Accelerator>{Context.ElementOp, Context.RuntimeSizeArgs,
+                                *FI.Accelerator};
+  Stream << "}\n";
+  return Stream;
+}
+
+// Turns a `NamedFunctionDescriptor` into a `FunctionImplementation` unfolding
+// the contiguous and overlap region into several statements. The zero case is
+// also mapped to its own type.
+//
+// - Contiguous: one `Individual` per size in [Begin, End).
+// - Overlap: one `Overlap` per element size I = Begin, 2*Begin, ... while
+//   I < End, each covering sizes below 2*I.
+// - Loop, AlignedLoop and Accelerator: copied over with the end of their span
+//   as upper bound.
+//
+// Aborts with a fatal error if a non-empty overlap region starts at size 0,
+// since doubling 0 would never reach the end of the span.
+static FunctionImplementation
+getImplementation(const NamedFunctionDescriptor &NamedFD) {
+  const FunctionDescriptor &FD = NamedFD.Desc;
+  FunctionImplementation Impl;
+  Impl.Ctx = getCtx(FD.Type);
+  Impl.Name = NamedFD.Name;
+  Impl.ElementClass = FD.ElementClass;
+  if (auto C = FD.Contiguous)
+    for (size_t I = C->Span.Begin; I < C->Span.End; ++I)
+      Impl.Individuals.push_back(Individual{I, ElementType{I}});
+  if (auto C = FD.Overlap) {
+    // `I *= 2` makes no progress from 0: reject instead of looping forever.
+    if (C->Span.Begin == 0 && C->Span.End != 0)
+      report_fatal_error("Overlap region cannot start at size 0");
+    for (size_t I = C->Span.Begin; I < C->Span.End; I *= 2)
+      Impl.Overlaps.push_back(Overlap{2 * I, ElementType{I}});
+  }
+  if (const auto &L = FD.Loop)
+    Impl.Loop = Loop{L->Span.End, ElementType{L->BlockSize}};
+  if (const auto &AL = FD.AlignedLoop)
+    Impl.AlignedLoop = AlignedLoop{
+        AL->Loop.Span.End, ElementType{AL->Loop.BlockSize},
+        ElementType{AL->Alignment}, getAligntoString(Impl.Ctx, AL->AlignTo)};
+  if (const auto &A = FD.Accelerator)
+    Impl.Accelerator = Accelerator{A->Span.End};
+  return Impl;
+}
+
+// Writes the implementation of every descriptor to `Stream`, in order.
+static void Serialize(raw_ostream &Stream,
+                      ArrayRef<NamedFunctionDescriptor> Descriptors) {
+  for (const NamedFunctionDescriptor &Named : Descriptors) {
+    const FunctionImplementation Impl = getImplementation(Named);
+    Stream << Impl;
+  }
+}
+
+} // namespace functions
+
+namespace descriptors {
+
+// This namespace generates the getFunctionDescriptors function:
+// -------------------------------------------------------------
+// e.g.
+// ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+//   static constexpr NamedFunctionDescriptor kDescriptors[] = {
+//     {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}},
+//     {"memcpy_0x8661D80472487AB5",{FunctionType::MEMCPY,Contiguous{{0,1}},llvm::None,llvm::None,llvm::None,Accelerator{{1,kMaxSize}},ElementTypeClass::NATIVE}},
+//     ...
+//   };
+//   return makeArrayRef(kDescriptors);
+// }
+
+// Prints a span as `{Begin,End}`; an end equal to `kMaxSize` is printed
+// symbolically as `kMaxSize`.
+static raw_ostream &operator<<(raw_ostream &Stream, const SizeSpan &SS) {
+  Stream << "{" << SS.Begin << ',';
+  if (SS.End != kMaxSize)
+    Stream << SS.End;
+  else
+    Stream << "kMaxSize";
+  Stream << '}';
+  return Stream;
+}
+// Prints `Contiguous{<span>}`.
+static raw_ostream &operator<<(raw_ostream &Stream, const Contiguous &O) {
+  Stream << "Contiguous{" << O.Span << '}';
+  return Stream;
+}
+// Prints `Overlap{<span>}`.
+static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) {
+  Stream << "Overlap{" << O.Span << '}';
+  return Stream;
+}
+// Prints `Loop{<span>,<block size>}`.
+static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) {
+  Stream << "Loop{" << O.Span << ',' << O.BlockSize << '}';
+  return Stream;
+}
+// Prints the qualified C++ spelling of an `AlignArg` enumerator.
+// `ARRAY_SIZE` and any value outside the enumeration are logic errors.
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignArg &O) {
+  switch (O) {
+  case AlignArg::_1:
+    return Stream << "AlignArg::_1";
+  case AlignArg::_2:
+    return Stream << "AlignArg::_2";
+  case AlignArg::ARRAY_SIZE:
+    break;
+  }
+  // Reached for the sentinel and for out-of-range values alike, so no path
+  // flows off the end of this non-void function.
+  report_fatal_error("logic error");
+}
+// Prints `AlignedLoop{<loop>,<alignment>,<align arg>}`.
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) {
+  Stream << "AlignedLoop{" << O.Loop << ',' << O.Alignment << ',';
+  Stream << O.AlignTo << '}';
+  return Stream;
+}
+// Prints `Accelerator{<span>}`.
+static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) {
+  Stream << "Accelerator{" << O.Span << '}';
+  return Stream;
+}
+// Prints the qualified C++ spelling of an `ElementTypeClass` enumerator.
+// A value outside the enumeration is a logic error.
+static raw_ostream &operator<<(raw_ostream &Stream, const ElementTypeClass &O) {
+  switch (O) {
+  case ElementTypeClass::SCALAR:
+    return Stream << "ElementTypeClass::SCALAR";
+  case ElementTypeClass::BUILTIN:
+    return Stream << "ElementTypeClass::BUILTIN";
+  case ElementTypeClass::NATIVE:
+    return Stream << "ElementTypeClass::NATIVE";
+  }
+  // Keeps control from flowing off the end of a non-void function when `O`
+  // holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+// Prints the qualified C++ spelling of a `FunctionType` enumerator.
+// A value outside the enumeration is a logic error.
+static raw_ostream &operator<<(raw_ostream &Stream, const FunctionType &T) {
+  switch (T) {
+  case FunctionType::MEMCPY:
+    return Stream << "FunctionType::MEMCPY";
+  case FunctionType::MEMCMP:
+    return Stream << "FunctionType::MEMCMP";
+  case FunctionType::BCMP:
+    return Stream << "FunctionType::BCMP";
+  case FunctionType::MEMSET:
+    return Stream << "FunctionType::MEMSET";
+  case FunctionType::BZERO:
+    return Stream << "FunctionType::BZERO";
+  }
+  // Keeps control from flowing off the end of a non-void function when `T`
+  // holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+// Prints the contained value, or `llvm::None` when the optional is empty.
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const llvm::Optional<T> &MaybeT) {
+  if (!MaybeT)
+    return Stream << "llvm::None";
+  return Stream << *MaybeT;
+}
+// Prints a descriptor as a brace-enclosed, comma-separated list of its fields
+// in the order Type, Contiguous, Overlap, Loop, AlignedLoop, Accelerator,
+// ElementClass.
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const FunctionDescriptor &FD) {
+  Stream << '{' << FD.Type;
+  Stream << ',' << FD.Contiguous;
+  Stream << ',' << FD.Overlap;
+  Stream << ',' << FD.Loop;
+  Stream << ',' << FD.AlignedLoop;
+  Stream << ',' << FD.Accelerator;
+  Stream << ',' << FD.ElementClass << '}';
+  return Stream;
+}
+// Prints a named descriptor as `{"<name>",<descriptor>}`.
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const NamedFunctionDescriptor &NFD) {
+  Stream << '{' << '"' << NFD.Name << '"';
+  Stream << ',' << NFD.Desc << '}';
+  return Stream;
+}
+// Prints the elements of a vector as a brace-enclosed, comma-separated list
+// with no trailing comma.
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const std::vector<T> &VectorT) {
+  Stream << '{';
+  // Empty before the first element, a comma before every following one.
+  const char *Separator = "";
+  for (const auto &Item : VectorT) {
+    Stream << Separator << Item;
+    Separator = ",";
+  }
+  Stream << '}';
+  return Stream;
+}
+
+// Writes the definition of `getFunctionDescriptors()`: a static constexpr
+// array holding one entry per descriptor, returned as an ArrayRef.
+static void Serialize(raw_ostream &Stream,
+                      ArrayRef<NamedFunctionDescriptor> Descriptors) {
+  Stream << R"(ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+  static constexpr NamedFunctionDescriptor kDescriptors[] = {
+)";
+  // One array entry per line, indented two levels.
+  for (const NamedFunctionDescriptor &Named : Descriptors)
+    Stream << kIndent << kIndent << Named << ",\n";
+  Stream << R"(  };
+  return makeArrayRef(kDescriptors);
+}
+)";
+}
+
+} // namespace descriptors
+
+namespace configurations {
+
+// This namespace generates the getXXXConfigurations functions:
+// ------------------------------------------------------------
+// e.g.
+// llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations() {
+//   using namespace __llvm_libc;
+//   static constexpr MemcpyConfiguration kConfigurations[] = {
+//     {Wrap<memcpy_0xE00E29EE73994E2B>, "memcpy_0xE00E29EE73994E2B"},
+//     {Wrap<memcpy_0x8661D80472487AB5>, "memcpy_0x8661D80472487AB5"},
+//     ...
+//   };
+//   return llvm::makeArrayRef(kConfigurations);
+// }
+
+// The `Wrap` template function is provided in the `Main` function below.
+// It is used to adapt the generated code to the prototype of the C function.
+// For instance, the generated code for a `memcpy` takes `char*` pointers and
+// returns nothing but the original C `memcpy` function takes and returns
+// `void*` pointers.
+
+// Tag type: streaming it prints the name of the `getXXXConfigurations`
+// function generated for `ForType`.
+struct FunctionName {
+  FunctionType ForType;
+};
+
+// Tag type: streaming it prints the name of the configuration type stored in
+// the array returned for `ForType`.
+struct ReturnType {
+  FunctionType ForType;
+};
+
+// Everything needed to emit one `getXXXConfigurations` function.
+struct Configuration {
+  FunctionName Name;
+  ReturnType Type;
+  // Non-owning pointers to the descriptors listed in the generated array.
+  std::vector<const NamedFunctionDescriptor *> Descriptors;
+};
+
+// Prints the name of the generated accessor function for `FN.ForType`.
+// A value outside the enumeration is a logic error.
+static raw_ostream &operator<<(raw_ostream &Stream, const FunctionName &FN) {
+  switch (FN.ForType) {
+  case FunctionType::MEMCPY:
+    return Stream << "getMemcpyConfigurations";
+  case FunctionType::MEMCMP:
+    return Stream << "getMemcmpConfigurations";
+  case FunctionType::BCMP:
+    return Stream << "getBcmpConfigurations";
+  case FunctionType::MEMSET:
+    return Stream << "getMemsetConfigurations";
+  case FunctionType::BZERO:
+    return Stream << "getBzeroConfigurations";
+  }
+  // Keeps control from flowing off the end of a non-void function when
+  // `ForType` holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+
+// Prints the name of the configuration type used for `RT.ForType`; MEMCMP and
+// BCMP share `MemcmpOrBcmpConfiguration`.
+// A value outside the enumeration is a logic error.
+static raw_ostream &operator<<(raw_ostream &Stream, const ReturnType &RT) {
+  switch (RT.ForType) {
+  case FunctionType::MEMCPY:
+    return Stream << "MemcpyConfiguration";
+  case FunctionType::MEMCMP:
+  case FunctionType::BCMP:
+    return Stream << "MemcmpOrBcmpConfiguration";
+  case FunctionType::MEMSET:
+    return Stream << "MemsetConfiguration";
+  case FunctionType::BZERO:
+    return Stream << "BzeroConfiguration";
+  }
+  // Keeps control from flowing off the end of a non-void function when
+  // `ForType` holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+
+// Prints one configuration entry: the wrapped function followed by its name
+// as a string literal.
+static raw_ostream &operator<<(raw_ostream &Stream,
+                               const NamedFunctionDescriptor *FD) {
+  Stream << formatv("{Wrap<{0}>, \"{0}\"}", FD->Name);
+  return Stream;
+}
+
+// Prints one configuration entry per line, indented two levels and followed
+// by a comma.
+static raw_ostream &
+operator<<(raw_ostream &Stream,
+           const std::vector<const NamedFunctionDescriptor *> &Descriptors) {
+  for (const NamedFunctionDescriptor *Entry : Descriptors)
+    Stream << kIndent << kIndent << Entry << ",\n";
+  return Stream;
+}
+
+// Emits one `getXXXConfigurations` function. With no descriptors the function
+// body is just `return {};`, otherwise it returns an ArrayRef over a static
+// constexpr array of configurations.
+static raw_ostream &operator<<(raw_ostream &Stream, const Configuration &C) {
+  Stream << "llvm::ArrayRef<" << C.Type << "> " << C.Name << "() {\n";
+  if (C.Descriptors.empty()) {
+    Stream << kIndent << "return {};\n";
+    return Stream << "}\n";
+  }
+  Stream << kIndent << "using namespace __llvm_libc;\n";
+  Stream << kIndent << "static constexpr " << C.Type
+         << " kConfigurations[] = {\n";
+  Stream << C.Descriptors;
+  Stream << kIndent << "};\n";
+  Stream << kIndent << "return llvm::makeArrayRef(kConfigurations);\n";
+  return Stream << "}\n";
+}
+
+// Emits the `getXXXConfigurations` function for `FT`, listing only the
+// descriptors whose type is `FT`.
+static void Serialize(raw_ostream &Stream, FunctionType FT,
+                      ArrayRef<NamedFunctionDescriptor> Descriptors) {
+  Configuration Selected;
+  Selected.Name.ForType = FT;
+  Selected.Type.ForType = FT;
+  for (const NamedFunctionDescriptor &Named : Descriptors) {
+    if (Named.Desc.Type != FT)
+      continue;
+    // Points into `Descriptors`, which outlives `Selected`.
+    Selected.Descriptors.push_back(&Named);
+  }
+  Stream << Selected;
+}
+
+} // namespace configurations
+// Writes the whole generated C++ file to `Stream`, in this order:
+//  - a banner, the includes and the using-declarations,
+//  - the function implementations inside `namespace __llvm_libc`,
+//  - `getFunctionDescriptors()` inside `namespace llvm::automemcpy`,
+//  - for each function type, a `Wrap` adapter template followed by the
+//    matching `getXXXConfigurations()` function(s),
+//  - a trailing comment repeating the number of functions.
+static void Serialize(raw_ostream &Stream,
+                      ArrayRef<NamedFunctionDescriptor> Descriptors) {
+  Stream << "// This file is auto-generated by libc/benchmarks/automemcpy.\n";
+  Stream << "// Functions : " << Descriptors.size() << "\n";
+  Stream << "\n";
+  Stream << "#include \"LibcFunctionPrototypes.h\"\n";
+  Stream << "#include \"automemcpy/FunctionDescriptor.h\"\n";
+  Stream << "#include \"src/string/memory_utils/elements.h\"\n";
+  Stream << "\n";
+  Stream << "using llvm::libc_benchmarks::BzeroConfiguration;\n";
+  Stream << "using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration;\n";
+  Stream << "using llvm::libc_benchmarks::MemcpyConfiguration;\n";
+  Stream << "using llvm::libc_benchmarks::MemsetConfiguration;\n";
+  Stream << "\n";
+  Stream << "namespace __llvm_libc {\n";
+  Stream << "\n";
+  // Bodies of all the generated functions.
+  codegen::functions::Serialize(Stream, Descriptors);
+  Stream << "\n";
+  Stream << "} // namespace __llvm_libc\n";
+  Stream << "\n";
+  Stream << "namespace llvm {\n";
+  Stream << "namespace automemcpy {\n";
+  Stream << "\n";
+  // Table describing each generated function.
+  codegen::descriptors::Serialize(Stream, Descriptors);
+  Stream << "\n";
+  Stream << "} // namespace automemcpy\n";
+  Stream << "} // namespace llvm\n";
+  Stream << "\n";
+  // memcpy: adapter from the `char *` stub to the `void *` C prototype.
+  Stream << R"(
+using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t);
+template <MemcpyStub Foo>
+void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) {
+  Foo(reinterpret_cast<char *__restrict>(dst),
+      reinterpret_cast<const char *__restrict>(src), size);
+  return dst;
+}
+)";
+  codegen::configurations::Serialize(Stream, FunctionType::MEMCPY, Descriptors);
+  // memcmp and bcmp share the same adapter.
+  Stream << R"(
+using MemcmpStub = int (*)(const char *, const char *, size_t);
+template <MemcmpStub Foo>
+int Wrap(const void *lhs, const void *rhs, size_t size) {
+  return Foo(reinterpret_cast<const char *>(lhs),
+             reinterpret_cast<const char *>(rhs), size);
+}
+)";
+  codegen::configurations::Serialize(Stream, FunctionType::MEMCMP, Descriptors);
+  codegen::configurations::Serialize(Stream, FunctionType::BCMP, Descriptors);
+  // memset adapter.
+  Stream << R"(
+using MemsetStub = void (*)(char *, int, size_t);
+template <MemsetStub Foo> void *Wrap(void *dst, int value, size_t size) {
+  Foo(reinterpret_cast<char *>(dst), value, size);
+  return dst;
+}
+)";
+  codegen::configurations::Serialize(Stream, FunctionType::MEMSET, Descriptors);
+  // bzero adapter.
+  Stream << R"(
+using BzeroStub = void (*)(char *, size_t);
+template <BzeroStub Foo> void Wrap(void *dst, size_t size) {
+  Foo(reinterpret_cast<char *>(dst), size);
+}
+)";
+  codegen::configurations::Serialize(Stream, FunctionType::BZERO, Descriptors);
+  Stream << "// Functions : " << Descriptors.size() << "\n";
+}
+
+} // namespace codegen
+
+// Interns `VolatileStr` in a function-local static set and returns a
+// StringRef to the copy owned by that set.
+StringRef getInternalizedString(std::string VolatileStr) {
+  static llvm::StringSet<> StringCache;
+  auto Inserted = StringCache.insert(std::move(VolatileStr));
+  // `first` designates the entry whether it was just added or already there.
+  return Inserted.first->getKey();
+}
+
+// Returns the lowercase C name of the function designated by `FT`.
+// A value outside the enumeration is a logic error.
+static StringRef getString(FunctionType FT) {
+  switch (FT) {
+  case FunctionType::MEMCPY:
+    return "memcpy";
+  case FunctionType::MEMCMP:
+    return "memcmp";
+  case FunctionType::BCMP:
+    return "bcmp";
+  case FunctionType::MEMSET:
+    return "memset";
+  case FunctionType::BZERO:
+    return "bzero";
+  }
+  // Keeps control from flowing off the end of a non-void function when `FT`
+  // holds a value that is not an enumerator.
+  report_fatal_error("logic error");
+}
+
+// Names every descriptor `<function>_<id as 16 hex digits>`, sorts the named
+// descriptors and writes the generated C++ file to `Stream`.
+void Serialize(raw_ostream &Stream, ArrayRef<FunctionDescriptor> Descriptors) {
+  std::vector<NamedFunctionDescriptor> FunctionDescriptors;
+  FunctionDescriptors.reserve(Descriptors.size());
+  for (const FunctionDescriptor &FD : Descriptors) {
+    FunctionDescriptors.emplace_back();
+    NamedFunctionDescriptor &Named = FunctionDescriptors.back();
+    Named.Name = getInternalizedString(
+        formatv("{0}_{1:X16}", getString(FD.Type), FD.id()));
+    // The elements of an ArrayRef are const, so this is necessarily a copy.
+    Named.Desc = FD;
+  }
+  // Sort functions so they are easier to spot in the generated C++ file.
+  std::sort(FunctionDescriptors.begin(), FunctionDescriptors.end(),
+            [](const NamedFunctionDescriptor &A,
+               const NamedFunctionDescriptor &B) { return A.Desc < B.Desc; });
+  codegen::Serialize(Stream, FunctionDescriptors);
+}
+
+} // namespace automemcpy
+} // namespace llvm

diff  --git a/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
new file mode 100644
index 0000000000000..618e4f1186e30
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
@@ -0,0 +1,28 @@
+#include "automemcpy/CodeGen.h"
+#include "automemcpy/RandomFunctionGenerator.h"
+#include <unordered_set>
+
+namespace llvm {
+namespace automemcpy {
+
+// Drains the random function generator and returns every distinct descriptor
+// it produces, in the order of first appearance.
+std::vector<FunctionDescriptor> generateFunctionDescriptors() {
+  std::unordered_set<FunctionDescriptor, FunctionDescriptor::Hasher> Seen;
+  std::vector<FunctionDescriptor> FunctionDescriptors;
+  RandomFunctionGenerator P;
+  while (Optional<FunctionDescriptor> MaybeFD = P.next()) {
+    // FIXME: Z3 sometimes returns twice the same object.
+    // `insert` reports whether the descriptor was new, so a single lookup is
+    // enough to both detect and record it.
+    if (Seen.insert(*MaybeFD).second)
+      FunctionDescriptors.push_back(std::move(*MaybeFD));
+  }
+  return FunctionDescriptors;
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+// Generates all function descriptors and prints the resulting C++ file on the
+// standard output.
+int main(int, char **) {
+  const auto Descriptors = llvm::automemcpy::generateFunctionDescriptors();
+  llvm::automemcpy::Serialize(llvm::outs(), Descriptors);
+}

diff  --git a/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
new file mode 100644
index 0000000000000..763fbc6d85a3d
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
@@ -0,0 +1,279 @@
+//===-- Generate random but valid function descriptors  -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/RandomFunctionGenerator.h"
+
+#include <llvm/ADT/None.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include <set>
+
+namespace llvm {
+namespace automemcpy {
+
+// Exploration parameters
+// ----------------------
+// Here we define a set of values that will constrain the exploration and
+// limit combinatorial explosion.
+
+// We limit the number of cases for individual sizes to sizes up to 4.
+// More individual sizes don't bring much over the overlapping strategy.
+static constexpr int kMaxIndividualSize = 4;
+
+// We limit Overlapping Strategy to sizes up to 256.
+// An overlap of 256B means accessing 128B at once which is usually not
+// feasible by current CPUs. We rely on the compiler to generate multiple
+// loads/stores if needed but higher sizes are unlikely to benefit from hardware
+// acceleration.
+static constexpr int kMaxOverlapSize = 256;
+
+// For the loop strategies, we make sure that they iterate at least a certain
+// number of times to amortize the cost of looping.
+static constexpr int kLoopMinIter = 3;
+static constexpr int kAlignedLoopMinIter = 2;
+
+// We restrict the size of the block of data to handle in a loop.
+// Generally speaking, block sizes <= 16 perform poorly.
+// NOTE(review): 16 is nevertheless part of the list below - confirm whether
+// the comment should read `< 16`.
+static constexpr int kLoopBlockSize[] = {16, 32, 64};
+
+// We restrict alignment to the following values.
+static constexpr int kLoopAlignments[] = {16, 32, 64};
+
+// We make sure that the region bounds are one of the following values.
+static constexpr int kAnchors[] = {0,  1,  2,   4,   8,   16,   32,      48,
+                                   64, 96, 128, 256, 512, 1024, kMaxSize};
+
+// We also allow disabling loops, aligned loops and accelerators.
+static constexpr bool kDisableLoop = false;
+static constexpr bool kDisableAlignedLoop = false;
+static constexpr bool kDisableAccelerator = false;
+
+// For memcpy, we can also explore whether aligning on source or destination has
+// an effect.
+static constexpr bool kExploreAlignmentArg = true;
+
+// The functions we generate code for.
+// BCMP is specifically disabled for now.
+static constexpr int kFunctionTypes[] = {
+    (int)FunctionType::MEMCPY,
+    (int)FunctionType::MEMCMP,
+    //  (int)FunctionType::BCMP,
+    (int)FunctionType::MEMSET,
+    (int)FunctionType::BZERO,
+};
+
+// The actual implementation of each function can be handled via primitive types
+// (SCALAR), vector types where available (NATIVE) or by the compiler (BUILTIN).
+// We want to move toward delegating the code generation entirely to the
+// compiler but for now we have to make use of -per microarchitecture- custom
+// implementations. Scalar being more portable but also less performant, we
+// remove it as well.
+static constexpr int kElementClasses[] = {
+    // (int)ElementTypeClass::SCALAR,
+    (int)ElementTypeClass::NATIVE,
+    // (int)ElementTypeClass::BUILTIN
+};
+
+// Declares one integer Z3 constant per field of a function descriptor and
+// registers with the solver all the constraints a valid descriptor must
+// satisfy. Each model of the resulting problem is one candidate function.
+RandomFunctionGenerator::RandomFunctionGenerator()
+    : Solver(Context), Type(Context.int_const("Type")),
+      ContiguousBegin(Context.int_const("ContiguousBegin")),
+      ContiguousEnd(Context.int_const("ContiguousEnd")),
+      OverlapBegin(Context.int_const("OverlapBegin")),
+      OverlapEnd(Context.int_const("OverlapEnd")),
+      LoopBegin(Context.int_const("LoopBegin")),
+      LoopEnd(Context.int_const("LoopEnd")),
+      LoopBlockSize(Context.int_const("LoopBlockSize")),
+      AlignedLoopBegin(Context.int_const("AlignedLoopBegin")),
+      AlignedLoopEnd(Context.int_const("AlignedLoopEnd")),
+      AlignedLoopBlockSize(Context.int_const("AlignedLoopBlockSize")),
+      AlignedAlignment(Context.int_const("AlignedAlignment")),
+      AlignedArg(Context.int_const("AlignedArg")),
+      AcceleratorBegin(Context.int_const("AcceleratorBegin")),
+      AcceleratorEnd(Context.int_const("AcceleratorEnd")),
+      ElementClass(Context.int_const("ElementClass")) {
+  // All possible functions.
+  Solver.add(inSetConstraint(Type, kFunctionTypes));
+
+  // Add constraints for region bounds.
+  addBoundsAndAnchors(ContiguousBegin, ContiguousEnd);
+  addBoundsAndAnchors(OverlapBegin, OverlapEnd);
+  addBoundsAndAnchors(LoopBegin, LoopEnd);
+  addBoundsAndAnchors(AlignedLoopBegin, AlignedLoopEnd);
+  addBoundsAndAnchors(AcceleratorBegin, AcceleratorEnd);
+  // We always consider strategies in this order, and we
+  // always end with the `Accelerator` strategy, as it's typically more
+  // efficient for large sizes.
+  // Contiguous <= Overlap <= Loop <= AlignedLoop <= Accelerator
+  // Each region starts exactly where the previous one ends, so the regions
+  // are adjacent; a region whose begin equals its end is empty.
+  Solver.add(ContiguousEnd == OverlapBegin);
+  Solver.add(OverlapEnd == LoopBegin);
+  Solver.add(LoopEnd == AlignedLoopBegin);
+  Solver.add(AlignedLoopEnd == AcceleratorBegin);
+  // Fix endpoints: The minimum size that we want to copy is 0, and we always
+  // start with the `Contiguous` strategy. The max size is `kMaxSize`.
+  Solver.add(ContiguousBegin == 0);
+  Solver.add(AcceleratorEnd == kMaxSize);
+  // Contiguous
+  Solver.add(ContiguousEnd <= kMaxIndividualSize + 1);
+  // Overlap
+  Solver.add(OverlapEnd <= kMaxOverlapSize + 1);
+  // Overlap only ever makes sense when accessing multiple bytes at a time.
+  // i.e. Overlap<1> is useless.
+  Solver.add(OverlapBegin == OverlapEnd || OverlapBegin >= 2);
+  // Loop
+  addLoopConstraints(LoopBegin, LoopEnd, LoopBlockSize, kLoopMinIter);
+  // Aligned Loop
+  addLoopConstraints(AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize,
+                     kAlignedLoopMinIter);
+  Solver.add(inSetConstraint(AlignedAlignment, kLoopAlignments));
+  // A non-empty aligned loop starts at size 64 or above, its block is at least
+  // as large as the alignment and as the block of the plain loop.
+  Solver.add(AlignedLoopBegin == AlignedLoopEnd || AlignedLoopBegin >= 64);
+  Solver.add(AlignedLoopBlockSize >= AlignedAlignment);
+  Solver.add(AlignedLoopBlockSize >= LoopBlockSize);
+  z3::expr IsMemcpy = Type == (int)FunctionType::MEMCPY;
+  z3::expr ExploreAlignment = IsMemcpy && kExploreAlignmentArg;
+  // Both alignment arguments are explored for memcpy when enabled; in every
+  // other case the argument is pinned to `AlignArg::_1`.
+  Solver.add(
+      (ExploreAlignment &&
+       inSetConstraint(AlignedArg, {(int)AlignArg::_1, (int)AlignArg::_2})) ||
+      (!ExploreAlignment && AlignedArg == (int)AlignArg::_1));
+  // Accelerator
+  Solver.add(IsMemcpy ||
+             (AcceleratorBegin ==
+              AcceleratorEnd)); // Only Memcpy has accelerator for now.
+  // Element classes
+  Solver.add(inSetConstraint(ElementClass, kElementClasses));
+
+  // Optional switches forcing the corresponding region to be empty.
+  if (kDisableLoop)
+    Solver.add(LoopBegin == LoopEnd);
+  if (kDisableAlignedLoop)
+    Solver.add(AlignedLoopBegin == AlignedLoopEnd);
+  if (kDisableAccelerator)
+    Solver.add(AcceleratorBegin == AcceleratorEnd);
+}
+
+// Builds the SizeSpan [Begin, End).
+// An empty span (Begin == End) is reported as llvm::None.
+static Optional<SizeSpan> AsSizeSpan(size_t Begin, size_t End) {
+  if (Begin != End) {
+    SizeSpan Span;
+    Span.Begin = Begin;
+    Span.End = End;
+    return Span;
+  }
+  return None;
+}
+
+// Builds a `Region` whose `Span` is [Begin, End), or None when that span is
+// empty. Works for any struct exposing a `Span` member.
+template <typename Region>
+static Optional<Region> As(size_t Begin, size_t End) {
+  const Optional<SizeSpan> MaybeSpan = AsSizeSpan(Begin, End);
+  if (!MaybeSpan)
+    return None;
+  Region Result;
+  Result.Span = *MaybeSpan;
+  return Result;
+}
+
+// Builds a Loop over [Begin, End) with the given block size, or None when the
+// span is empty.
+static Optional<Loop> AsLoop(size_t Begin, size_t End, size_t BlockSize) {
+  const Optional<SizeSpan> MaybeSpan = AsSizeSpan(Begin, End);
+  if (!MaybeSpan)
+    return None;
+  Loop Result;
+  Result.Span = *MaybeSpan;
+  Result.BlockSize = BlockSize;
+  return Result;
+}
+
+// Builds an AlignedLoop over [Begin, End) with the given block size, alignment
+// and aligned argument, or None when the span is empty.
+static Optional<AlignedLoop> AsAlignedLoop(size_t Begin, size_t End,
+                                           size_t BlockSize, size_t Alignment,
+                                           AlignArg AlignTo) {
+  const auto MaybeLoop = AsLoop(Begin, End, BlockSize);
+  if (!MaybeLoop)
+    return None;
+  AlignedLoop Result;
+  Result.Loop = *MaybeLoop;
+  Result.Alignment = Alignment;
+  Result.AlignTo = AlignTo;
+  return Result;
+}
+
+// Returns the next function descriptor satisfying the solver constraints, or
+// an empty Optional once the solver reports that no model is left.
+// Every returned model is then excluded by adding its negation to the solver,
+// so each call moves on to a different assignment of the variables.
+Optional<FunctionDescriptor> RandomFunctionGenerator::next() {
+  if (Solver.check() != z3::sat)
+    return {};
+
+  z3::model m = Solver.get_model();
+
+  // Helper method to get the current numerical value of a z3::expr.
+  const auto E = [&m](z3::expr &V) -> int {
+    return m.eval(V).get_numeral_int();
+  };
+
+  // Fill in the function descriptor to return.
+  // Regions whose begin and end coincide are left unset by the `As*` helpers.
+  FunctionDescriptor R;
+  R.Type = FunctionType(E(Type));
+  R.Contiguous = As<Contiguous>(E(ContiguousBegin), E(ContiguousEnd));
+  R.Overlap = As<Overlap>(E(OverlapBegin), E(OverlapEnd));
+  R.Loop = AsLoop(E(LoopBegin), E(LoopEnd), E(LoopBlockSize));
+  R.AlignedLoop = AsAlignedLoop(E(AlignedLoopBegin), E(AlignedLoopEnd),
+                                E(AlignedLoopBlockSize), E(AlignedAlignment),
+                                AlignArg(E(AlignedArg)));
+  R.Accelerator = As<Accelerator>(E(AcceleratorBegin), E(AcceleratorEnd));
+  R.ElementClass = ElementTypeClass(E(ElementClass));
+
+  // Express current state as a set of constraints.
+  // All variables take part, including those of regions that ended up empty.
+  z3::expr CurrentLayout =
+      (Type == E(Type)) && (ContiguousBegin == E(ContiguousBegin)) &&
+      (ContiguousEnd == E(ContiguousEnd)) &&
+      (OverlapBegin == E(OverlapBegin)) && (OverlapEnd == E(OverlapEnd)) &&
+      (LoopBegin == E(LoopBegin)) && (LoopEnd == E(LoopEnd)) &&
+      (LoopBlockSize == E(LoopBlockSize)) &&
+      (AlignedLoopBegin == E(AlignedLoopBegin)) &&
+      (AlignedLoopEnd == E(AlignedLoopEnd)) &&
+      (AlignedLoopBlockSize == E(AlignedLoopBlockSize)) &&
+      (AlignedAlignment == E(AlignedAlignment)) &&
+      (AlignedArg == E(AlignedArg)) &&
+      (AcceleratorBegin == E(AcceleratorBegin)) &&
+      (AcceleratorEnd == E(AcceleratorEnd)) &&
+      (ElementClass == E(ElementClass));
+
+  // Ask solver to never show this configuration ever again.
+  Solver.add(!CurrentLayout);
+  return R;
+}
+
+// Builds the constraint "`Variable` equals one of `Values`" as a disjunction
+// of equalities, one per value and in the order given.
+z3::expr RandomFunctionGenerator::inSetConstraint(z3::expr &Variable,
+                                                  ArrayRef<int> Values) const {
+  z3::expr_vector Alternatives(Variable.ctx());
+  for (const int Candidate : Values) {
+    z3::expr IsCandidate = Variable == Candidate;
+    Alternatives.push_back(IsCandidate);
+  }
+  return z3::mk_or(Alternatives);
+}
+
+// Constrains a region [Begin, End): both bounds must be anchors, and
+// 0 <= Begin <= End <= kMaxSize.
+void RandomFunctionGenerator::addBoundsAndAnchors(z3::expr &Begin,
+                                                  z3::expr &End) {
+  // Begin and End are picked amongst a set of predefined values.
+  Solver.add(inSetConstraint(Begin, kAnchors));
+  Solver.add(inSetConstraint(End, kAnchors));
+  Solver.add(Begin >= 0);
+  Solver.add(Begin <= End);
+  Solver.add(End <= kMaxSize);
+}
+
+// Constrains a loop region: the block size must be one of `kLoopBlockSize`
+// and, unless the region is empty, its first size must be greater than
+// `LoopMinIter` blocks.
+// NOTE(review): the comparison is strict although the comment on the minimum
+// iteration constants says "at least" - confirm `>` rather than `>=` is meant.
+void RandomFunctionGenerator::addLoopConstraints(const z3::expr &LoopBegin,
+                                                 const z3::expr &LoopEnd,
+                                                 z3::expr &LoopBlockSize,
+                                                 int LoopMinIter) {
+  Solver.add(inSetConstraint(LoopBlockSize, kLoopBlockSize));
+  Solver.add(LoopBegin == LoopEnd ||
+             (LoopBegin > (LoopMinIter * LoopBlockSize)));
+}
+
+} // namespace automemcpy
+} // namespace llvm

diff  --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
new file mode 100644
index 0000000000000..00298f69f77f6
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
@@ -0,0 +1,180 @@
+//===-- Analyze benchmark JSON files --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This code analyzes the json file produced by the `automemcpy` binary.
+//
+// As a reminder, `automemcpy` will benchmark each autogenerated memory
+// function against one of the predefined distributions available in the
+// `libc/benchmarks/distributions` folder.
+//
+// It works as follows:
+// - Reads one or more json files.
+// - If there are several runs for the same function and distribution, picks the
+//   median throughput (aka `BytesPerSecond`).
+// - Aggregates the throughputs per distribution and scores them from worst
+//   (0) to best (1).
+// - Each distribution categorizes each function into one of the following
+//   categories: EXCELLENT, VERY_GOOD, GOOD, PASSABLE, INADEQUATE, MEDIOCRE,
+//   BAD.
+// - A process similar to the Majority Judgment voting system is used to `elect`
+//   the best function. The histogram of grades is returned so we can
+//   distinguish between functions with the same final grade. In the following
+//   example both functions grade EXCELLENT but we may prefer the second one.
+//
+//   |            | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | ...
+//   |------------|-----------|-----------|------|----------| ...
+//   | Function_1 |     7     |     1     |   2  |          | ...
+//   | Function_2 |     6     |     4     |      |          | ...
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "llvm/ADT/StringRef.h"
+#include <numeric>
+#include <unordered_map>
+
+namespace llvm {
+
+namespace automemcpy {
+
+// Returns the name of a grade as an uppercase string.
+// `ARRAY_SIZE` and any value outside the enumeration are logic errors.
+StringRef Grade::getString(const GradeEnum &GE) {
+  switch (GE) {
+  case EXCELLENT:
+    return "EXCELLENT";
+  case VERY_GOOD:
+    return "VERY_GOOD";
+  case GOOD:
+    return "GOOD";
+  case PASSABLE:
+    return "PASSABLE";
+  case INADEQUATE:
+    return "INADEQUATE";
+  case MEDIOCRE:
+    return "MEDIOCRE";
+  case BAD:
+    return "BAD";
+  case ARRAY_SIZE:
+    break;
+  }
+  // Reached for the sentinel and for out-of-range values alike, so no path
+  // flows off the end of this non-void function.
+  report_fatal_error("logic error");
+}
+
+// Maps a score to a grade using seven equal-width bands: the first band whose
+// lower bound is reached wins, and anything below 1/7 is BAD.
+Grade::GradeEnum Grade::judge(double Score) {
+  struct Band {
+    double LowerBound;
+    GradeEnum Result;
+  };
+  // Ordered from the highest band to the lowest.
+  static const Band kBands[] = {
+      {6. / 7, EXCELLENT}, {5. / 7, VERY_GOOD},  {4. / 7, GOOD},
+      {3. / 7, PASSABLE},  {2. / 7, INADEQUATE}, {1. / 7, MEDIOCRE},
+  };
+  for (const Band &B : kBands)
+    if (Score >= B.LowerBound)
+      return B.Result;
+  return BAD;
+}
+
+// Groups the samples by id, reduces every group to its median throughput and
+// returns one FunctionData per function holding the median of each of its
+// distributions. For an even number of runs the upper middle value is used.
+std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples) {
+  // Bucket the measured throughputs by sample id.
+  std::unordered_map<SampleId, std::vector<double>, SampleId::Hasher>
+      RunsById;
+  for (const auto &S : Samples)
+    RunsById[S.Id].push_back(S.BytesPerSecond);
+
+  // Median of each bucket, indexed by function then by distribution name.
+  std::unordered_map<FunctionId, StringMap<double>, FunctionId::Hasher>
+      MedianByFunction;
+  for (auto &Bucket : RunsById) {
+    auto &Runs = Bucket.second;
+    const auto Middle = Runs.begin() + Runs.size() / 2;
+    std::nth_element(Runs.begin(), Middle, Runs.end());
+    const auto &Id = Bucket.first;
+    MedianByFunction[Id.Function][Id.Distribution.Name] = *Middle;
+  }
+
+  // Flatten into the output structure.
+  std::vector<FunctionData> Output;
+  for (auto &FunctionEntry : MedianByFunction) {
+    FunctionData Data;
+    Data.Id = FunctionEntry.first;
+    for (const auto &DistributionEntry : FunctionEntry.second) {
+      auto &PerDistribution =
+          Data.PerDistributionData[DistributionEntry.getKey()];
+      PerDistribution.MedianBytesPerSecond = DistributionEntry.getValue();
+    }
+    Output.push_back(std::move(Data));
+  }
+  return Output;
+}
+
+// Fills the `Score` of every (function, distribution) pair by normalizing its
+// median throughput against the minimum and maximum observed for the same
+// function type and distribution: the slowest gets 0 and the fastest gets 1.
+void fillScores(MutableArrayRef<FunctionData> Functions) {
+  // A key to bucket throughput per function type and distribution.
+  struct Key {
+    FunctionType Type;
+    StringRef Distribution;
+
+    COMPARABLE_AND_HASHABLE(Key, Type, Distribution)
+  };
+
+  // Tracks minimum and maximum values.
+  struct MinMax {
+    double Min = std::numeric_limits<double>::max();
+    // `lowest()` is the most negative finite double. `min()` is the smallest
+    // positive one, which would not be replaced by values that are <= 0.
+    double Max = std::numeric_limits<double>::lowest();
+    void update(double Value) {
+      if (Value < Min)
+        Min = Value;
+      if (Value > Max)
+        Max = Value;
+    }
+    // NOTE(review): evaluates to NaN (0/0) when Max == Min, e.g. when a bucket
+    // holds a single function - confirm this is acceptable to callers.
+    double normalize(double Value) const { return (Value - Min) / (Max - Min); }
+  };
+
+  // First pass: collect the throughput range of every bucket.
+  std::unordered_map<Key, MinMax, Key::Hasher> ThroughputMinMax;
+  for (const auto &Function : Functions) {
+    const FunctionType Type = Function.Id.Type;
+    for (const auto &Pair : Function.PerDistributionData) {
+      const auto &Distribution = Pair.getKey();
+      const double Throughput = Pair.getValue().MedianBytesPerSecond;
+      const Key K{Type, Distribution};
+      ThroughputMinMax[K].update(Throughput);
+    }
+  }
+
+  // Second pass: score each throughput within the range of its bucket.
+  for (auto &Function : Functions) {
+    const FunctionType Type = Function.Id.Type;
+    for (const auto &Pair : Function.PerDistributionData) {
+      const auto &Distribution = Pair.getKey();
+      const double Throughput = Pair.getValue().MedianBytesPerSecond;
+      const Key K{Type, Distribution};
+      // The entry already exists, so indexing does not grow the map that is
+      // being iterated.
+      Function.PerDistributionData[Distribution].Score =
+          ThroughputMinMax[K].normalize(Throughput);
+    }
+  }
+}
+
+// Grades every (function, distribution) score, tallies the grades in the
+// histogram of the function, then sets the final grade of each function to
+// the first histogram bin at which the running total exceeds half the votes.
+// The final grade is BAD when no bin satisfies that condition.
+void castVotes(MutableArrayRef<FunctionData> Functions) {
+  // One vote per distribution.
+  for (FunctionData &Function : Functions) {
+    for (const auto &Entry : Function.PerDistributionData) {
+      const StringRef Name = Entry.getKey();
+      const auto Vote = Grade::judge(Entry.getValue().Score);
+      ++(Function.GradeHisto[Vote]);
+      Function.PerDistributionData[Name].Grade = Vote;
+    }
+  }
+
+  // Median grade per function.
+  for (FunctionData &Function : Functions) {
+    const auto &Histogram = Function.GradeHisto;
+    const size_t TotalVotes =
+        std::accumulate(Histogram.begin(), Histogram.end(), 0U);
+    const size_t HalfVotes = TotalVotes / 2;
+    Grade::GradeEnum Elected = Grade::BAD;
+    size_t RunningTotal = 0;
+    for (size_t Bin = 0; Bin < Histogram.size(); ++Bin) {
+      RunningTotal += Histogram[Bin];
+      if (RunningTotal > HalfVotes) {
+        Elected = Grade::GradeEnum(Bin);
+        break;
+      }
+    }
+    Function.FinalGrade = Elected;
+  }
+}
+
+} // namespace automemcpy
+} // namespace llvm

diff  --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
new file mode 100644
index 0000000000000..6a657e432c18f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
@@ -0,0 +1,158 @@
+//===-- Application to analyze benchmark JSON files -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+// User can specify one or more json filenames to process on the command line.
+static cl::list<std::string> InputFilenames(cl::Positional, cl::OneOrMore,
+                                            cl::desc("<input json files>"));
+
+namespace automemcpy {
+
+// This is defined in the autogenerated 'Implementations.cpp' file.
+extern ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors();
+
+// Builds the table that maps every function name returned by
+// 'getFunctionDescriptors' to a pointer to the matching descriptor.
+static StringMap<const FunctionDescriptor *> createFunctionDescriptorMap() {
+  StringMap<const FunctionDescriptor *> ByName;
+  for (const NamedFunctionDescriptor &Named : getFunctionDescriptors())
+    ByName[Named.Name] = &Named.Desc;
+  return ByName;
+}
+
+// Returns the descriptor registered under 'FunctionName'. The name table is
+// built on the first call; an unknown name ends in 'report_fatal_error'.
+static const FunctionDescriptor &getFunctionDescriptor(StringRef FunctionName) {
+  static StringMap<const FunctionDescriptor *> Table =
+      createFunctionDescriptorMap();
+  if (const FunctionDescriptor *Found = Table.lookup(FunctionName))
+    return *Found;
+  report_fatal_error(
+      Twine("No FunctionDescriptor for ").concat(FunctionName));
+}
+
+// Function and distribution names recur in many samples: keep one copy of each
+// in a function-local set and hand out a 'StringRef' to the set's own key.
+static StringRef getInternalizedString(StringRef VolatileStr) {
+  static StringSet<> Interned;
+  return Interned.insert(VolatileStr).first->getKey();
+}
+
+// Helper function for the LLVM JSON API: fills 'Out' from one benchmark entry.
+// The "label" field is split at its first ',' into function and distribution.
+bool fromJSON(const json::Value &V, Sample &Out, json::Path P) {
+  std::string Label;
+  json::ObjectMapper Mapper(V, P);
+  if (!Mapper || !Mapper.map("bytes_per_second", Out.BytesPerSecond) ||
+      !Mapper.map("label", Label))
+    return false;
+  const auto Names = StringRef(Label).split(',');
+  Out.Id.Function.Name = getInternalizedString(Names.first);
+  Out.Id.Function.Type = getFunctionDescriptor(Names.first).Type;
+  Out.Id.Distribution.Name = getInternalizedString(Names.second);
+  return true;
+}
+
+// An object to represent the content of the JSON file.
+// Parsing/serializing JSON is easier when the structure of the object mirrors
+// the structure of the JSON file.
+struct JsonFile {
+  std::vector<Sample> Samples;
+};
+
+// LLVM JSON API hook: reads the "benchmarks" member of 'V' into 'JF.Samples'.
+bool fromJSON(const json::Value &V, JsonFile &JF, json::Path P) {
+  json::ObjectMapper Mapper(V, P);
+  return Mapper && Mapper.map("benchmarks", JF.Samples);
+}
+
+// Global object to ease error reporting: it consumes errors and crashes the
+// application with a meaningful message.
+static ExitOnError ExitOnErr;
+
+// Main JSON parsing method. Loads the file named 'Filename', parses it as JSON
+// and maps it to a JsonFile; every failure is handed to 'ExitOnErr'.
+JsonFile parseJsonResultFile(StringRef Filename) {
+  auto Buffer = ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(
+      Filename, /*IsText=*/true, /*RequiresNullTerminator=*/false)));
+  const json::Value Parsed = ExitOnErr(json::parse(Buffer->getBuffer()));
+  json::Path::Root Root;
+  JsonFile Result;
+  const bool Mapped = fromJSON(Parsed, Result, Root);
+  if (!Mapped)
+    ExitOnErr(Root.getError());
+  return Result;
+}
+
+// Serializes 'GH' to 'Stream' as one block character per grade; bar height is
+// the grade's count relative to the largest count in the histogram.
+static void Serialize(raw_ostream &Stream, const GradeHistogram &GH) {
+  static constexpr std::array<StringRef, 9> kCharacters = {
+      " ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
+  // Clamp to 1 so that an all-zero histogram does not divide by zero.
+  const size_t Max =
+      std::max<size_t>(1, *std::max_element(GH.begin(), GH.end()));
+  for (const size_t Count : GH)
+    Stream << kCharacters.at((float(Count) / Max) * (kCharacters.size() - 1));
+}
+
+// Parses the input JSON files, grades each function and prints the results.
+int Main(int argc, char **argv) {
+  ExitOnErr.setBanner("Automemcpy Json Results Analyzer stopped with error: ");
+  cl::ParseCommandLineOptions(argc, argv, "Automemcpy Json Results Analyzer\n");
+
+  // Reads all samples stored in the input JSON files.
+  std::vector<Sample> Samples;
+  for (const auto &Filename : InputFilenames) {
+    auto Result = parseJsonResultFile(Filename);
+    llvm::append_range(Samples, Result.Samples);
+  }
+
+  // Extracts the median throughput per function and distribution, normalizes
+  // it into a score, then turns the scores into grades.
+  std::vector<FunctionData> Functions = getThroughputs(Samples);
+  fillScores(Functions);
+  castVotes(Functions);
+
+  // Present data by function type and, within a type, by final grade: one sort
+  // on the (type, grade) pair yields that order directly.
+  // TODO: Implement tie breaking algorithm.
+  std::sort(Functions.begin(), Functions.end(),
+            [](const FunctionData &A, const FunctionData &B) {
+              if (A.Id.Type != B.Id.Type)
+                return A.Id.Type < B.Id.Type;
+              return A.FinalGrade < B.FinalGrade;
+            });
+
+  // Print result.
+  for (const FunctionData &Function : Functions) {
+    outs() << formatv("{0,-10}", Grade::getString(Function.FinalGrade));
+    outs() << " |";
+    Serialize(outs(), Function.GradeHisto);
+    outs() << "| ";
+    outs().resetColor();
+    outs() << formatv("{0,+25}", Function.Id.Name);
+    outs() << "\n";
+  }
+
+  return EXIT_SUCCESS;
+}
+
+} // namespace automemcpy
+} // namespace llvm
+// Program entry point: forwards to llvm::automemcpy::Main.
+int main(int argc, char **argv) { return llvm::automemcpy::Main(argc, argv); }

diff  --git a/libc/benchmarks/automemcpy/unittests/CMakeLists.txt b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt
new file mode 100644
index 0000000000000..35caaac1519ba
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_libc_benchmark_unittest(libc-automemcpy-codegen-test # Code generator test.
+    SRCS CodeGenTest.cpp
+    DEPENDS automemcpy_codegen
+)
+
+add_libc_benchmark_unittest(libc-automemcpy-result-analyzer-test # Analyzer test.
+    SRCS ResultAnalyzerTest.cpp
+    DEPENDS automemcpy_result_analyzer_lib
+)

diff  --git a/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
new file mode 100644
index 0000000000000..6849682c44459
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
@@ -0,0 +1,219 @@
+//===-- Automemcpy CodeGen Test -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/CodeGen.h"
+#include "automemcpy/RandomFunctionGenerator.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::AllOf;
+using testing::AnyOf;
+using testing::ElementsAre;
+using testing::Ge;
+using testing::Gt;
+using testing::Le;
+using testing::Lt;
+
+namespace llvm {
+namespace automemcpy {
+namespace {
+
+TEST(Automemcpy, Codegen) {
+  static constexpr FunctionDescriptor kDescriptors[] = {
+      {FunctionType::MEMCPY, llvm::None, llvm::None, llvm::None, llvm::None,
+       Accelerator{{0, kMaxSize}}, ElementTypeClass::NATIVE},
+      {FunctionType::MEMCPY, Contiguous{{0, 4}}, Overlap{{4, 256}},
+       Loop{{256, kMaxSize}, 64}, llvm::None, llvm::None,
+       ElementTypeClass::NATIVE},
+      {FunctionType::MEMCMP, Contiguous{{0, 2}}, Overlap{{2, 64}}, llvm::None,
+       AlignedLoop{Loop{{64, kMaxSize}, 16}, 16, AlignArg::_1}, llvm::None,
+       ElementTypeClass::NATIVE},
+      {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None,
+       AlignedLoop{Loop{{256, kMaxSize}, 32}, 16, AlignArg::_1}, llvm::None,
+       ElementTypeClass::NATIVE},
+      {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None,
+       AlignedLoop{Loop{{256, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None,
+       ElementTypeClass::NATIVE},
+      {FunctionType::BZERO, Contiguous{{0, 4}}, Overlap{{4, 128}}, llvm::None,
+       AlignedLoop{Loop{{128, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None,
+       ElementTypeClass::NATIVE},
+  };
+  // Render the descriptors above into the text of a C++ source file.
+  std::string Output;
+  raw_string_ostream OutputStream(Output);
+  Serialize(OutputStream, kDescriptors);
+  // The rendered text must match this golden file character for character.
+  EXPECT_STREQ(OutputStream.str().c_str(),
+               R"(// This file is auto-generated by libc/benchmarks/automemcpy.
+// Functions : 6
+
+#include "LibcFunctionPrototypes.h"
+#include "automemcpy/FunctionDescriptor.h"
+#include "src/string/memory_utils/elements.h"
+
+using llvm::libc_benchmarks::BzeroConfiguration;
+using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration;
+using llvm::libc_benchmarks::MemcpyConfiguration;
+using llvm::libc_benchmarks::MemsetConfiguration;
+
+namespace __llvm_libc {
+
+static void memcpy_0xE00E29EE73994E2B(char *__restrict dst, const char *__restrict src, size_t size) {
+  using namespace __llvm_libc::x86;
+  return Copy<Accelerator>(dst, src, size);
+}
+static void memcpy_0x7381B60C7BE75EF9(char *__restrict dst, const char *__restrict src, size_t size) {
+  using namespace __llvm_libc::x86;
+  if(size == 0) return;
+  if(size == 1) return Copy<_1>(dst, src);
+  if(size == 2) return Copy<_2>(dst, src);
+  if(size == 3) return Copy<_3>(dst, src);
+  if(size < 8) return Copy<HeadTail<_4>>(dst, src, size);
+  if(size < 16) return Copy<HeadTail<_8>>(dst, src, size);
+  if(size < 32) return Copy<HeadTail<_16>>(dst, src, size);
+  if(size < 64) return Copy<HeadTail<_32>>(dst, src, size);
+  if(size < 128) return Copy<HeadTail<_64>>(dst, src, size);
+  if(size < 256) return Copy<HeadTail<_128>>(dst, src, size);
+  return Copy<Loop<_64>>(dst, src, size);
+}
+static int memcmp_0x348D7BA6DB0EE033(const char * lhs, const char * rhs, size_t size) {
+  using namespace __llvm_libc::x86;
+  if(size == 0) return 0;
+  if(size == 1) return ThreeWayCompare<_1>(lhs, rhs);
+  if(size < 4) return ThreeWayCompare<HeadTail<_2>>(lhs, rhs, size);
+  if(size < 8) return ThreeWayCompare<HeadTail<_4>>(lhs, rhs, size);
+  if(size < 16) return ThreeWayCompare<HeadTail<_8>>(lhs, rhs, size);
+  if(size < 32) return ThreeWayCompare<HeadTail<_16>>(lhs, rhs, size);
+  if(size < 64) return ThreeWayCompare<HeadTail<_32>>(lhs, rhs, size);
+  return ThreeWayCompare<Align<_16,Arg::Lhs>::Then<Loop<_16>>>(lhs, rhs, size);
+}
+static void memset_0x71E761699B999863(char * dst, int value, size_t size) {
+  using namespace __llvm_libc::x86;
+  if(size == 0) return;
+  if(size == 1) return SplatSet<_1>(dst, value);
+  if(size < 4) return SplatSet<HeadTail<_2>>(dst, value, size);
+  if(size < 8) return SplatSet<HeadTail<_4>>(dst, value, size);
+  if(size < 16) return SplatSet<HeadTail<_8>>(dst, value, size);
+  if(size < 32) return SplatSet<HeadTail<_16>>(dst, value, size);
+  if(size < 64) return SplatSet<HeadTail<_32>>(dst, value, size);
+  if(size < 128) return SplatSet<HeadTail<_64>>(dst, value, size);
+  if(size < 256) return SplatSet<HeadTail<_128>>(dst, value, size);
+  return SplatSet<Align<_16,Arg::Dst>::Then<Loop<_32>>>(dst, value, size);
+}
+static void memset_0x3DF0F44E2ED6A50F(char * dst, int value, size_t size) {
+  using namespace __llvm_libc::x86;
+  if(size == 0) return;
+  if(size == 1) return SplatSet<_1>(dst, value);
+  if(size < 4) return SplatSet<HeadTail<_2>>(dst, value, size);
+  if(size < 8) return SplatSet<HeadTail<_4>>(dst, value, size);
+  if(size < 16) return SplatSet<HeadTail<_8>>(dst, value, size);
+  if(size < 32) return SplatSet<HeadTail<_16>>(dst, value, size);
+  if(size < 64) return SplatSet<HeadTail<_32>>(dst, value, size);
+  if(size < 128) return SplatSet<HeadTail<_64>>(dst, value, size);
+  if(size < 256) return SplatSet<HeadTail<_128>>(dst, value, size);
+  return SplatSet<Align<_32,Arg::Dst>::Then<Loop<_32>>>(dst, value, size);
+}
+static void bzero_0x475977492C218AD4(char * dst, size_t size) {
+  using namespace __llvm_libc::x86;
+  if(size == 0) return;
+  if(size == 1) return SplatSet<_1>(dst, 0);
+  if(size == 2) return SplatSet<_2>(dst, 0);
+  if(size == 3) return SplatSet<_3>(dst, 0);
+  if(size < 8) return SplatSet<HeadTail<_4>>(dst, 0, size);
+  if(size < 16) return SplatSet<HeadTail<_8>>(dst, 0, size);
+  if(size < 32) return SplatSet<HeadTail<_16>>(dst, 0, size);
+  if(size < 64) return SplatSet<HeadTail<_32>>(dst, 0, size);
+  if(size < 128) return SplatSet<HeadTail<_64>>(dst, 0, size);
+  return SplatSet<Align<_32,Arg::Dst>::Then<Loop<_32>>>(dst, 0, size);
+}
+
+} // namespace __llvm_libc
+
+namespace llvm {
+namespace automemcpy {
+
+ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+  static constexpr NamedFunctionDescriptor kDescriptors[] = {
+    {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}},
+    {"memcpy_0x7381B60C7BE75EF9",{FunctionType::MEMCPY,Contiguous{{0,4}},Overlap{{4,256}},Loop{{256,kMaxSize},64},llvm::None,llvm::None,ElementTypeClass::NATIVE}},
+    {"memcmp_0x348D7BA6DB0EE033",{FunctionType::MEMCMP,Contiguous{{0,2}},Overlap{{2,64}},llvm::None,AlignedLoop{Loop{{64,kMaxSize},16},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+    {"memset_0x71E761699B999863",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+    {"memset_0x3DF0F44E2ED6A50F",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+    {"bzero_0x475977492C218AD4",{FunctionType::BZERO,Contiguous{{0,4}},Overlap{{4,128}},llvm::None,AlignedLoop{Loop{{128,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+  };
+  return makeArrayRef(kDescriptors);
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+
+using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t);
+template <MemcpyStub Foo>
+void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) {
+  Foo(reinterpret_cast<char *__restrict>(dst),
+      reinterpret_cast<const char *__restrict>(src), size);
+  return dst;
+}
+llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations() {
+  using namespace __llvm_libc;
+  static constexpr MemcpyConfiguration kConfigurations[] = {
+    {Wrap<memcpy_0xE00E29EE73994E2B>, "memcpy_0xE00E29EE73994E2B"},
+    {Wrap<memcpy_0x7381B60C7BE75EF9>, "memcpy_0x7381B60C7BE75EF9"},
+  };
+  return llvm::makeArrayRef(kConfigurations);
+}
+
+using MemcmpStub = int (*)(const char *, const char *, size_t);
+template <MemcmpStub Foo>
+int Wrap(const void *lhs, const void *rhs, size_t size) {
+  return Foo(reinterpret_cast<const char *>(lhs),
+             reinterpret_cast<const char *>(rhs), size);
+}
+llvm::ArrayRef<MemcmpOrBcmpConfiguration> getMemcmpConfigurations() {
+  using namespace __llvm_libc;
+  static constexpr MemcmpOrBcmpConfiguration kConfigurations[] = {
+    {Wrap<memcmp_0x348D7BA6DB0EE033>, "memcmp_0x348D7BA6DB0EE033"},
+  };
+  return llvm::makeArrayRef(kConfigurations);
+}
+llvm::ArrayRef<MemcmpOrBcmpConfiguration> getBcmpConfigurations() {
+  return {};
+}
+
+using MemsetStub = void (*)(char *, int, size_t);
+template <MemsetStub Foo> void *Wrap(void *dst, int value, size_t size) {
+  Foo(reinterpret_cast<char *>(dst), value, size);
+  return dst;
+}
+llvm::ArrayRef<MemsetConfiguration> getMemsetConfigurations() {
+  using namespace __llvm_libc;
+  static constexpr MemsetConfiguration kConfigurations[] = {
+    {Wrap<memset_0x71E761699B999863>, "memset_0x71E761699B999863"},
+    {Wrap<memset_0x3DF0F44E2ED6A50F>, "memset_0x3DF0F44E2ED6A50F"},
+  };
+  return llvm::makeArrayRef(kConfigurations);
+}
+
+using BzeroStub = void (*)(char *, size_t);
+template <BzeroStub Foo> void Wrap(void *dst, size_t size) {
+  Foo(reinterpret_cast<char *>(dst), size);
+}
+llvm::ArrayRef<BzeroConfiguration> getBzeroConfigurations() {
+  using namespace __llvm_libc;
+  static constexpr BzeroConfiguration kConfigurations[] = {
+    {Wrap<bzero_0x475977492C218AD4>, "bzero_0x475977492C218AD4"},
+  };
+  return llvm::makeArrayRef(kConfigurations);
+}
+// Functions : 6
+)");
+}
+} // namespace
+} // namespace automemcpy
+} // namespace llvm

diff  --git a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp
new file mode 100644
index 0000000000000..bce508d17acbd
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp
@@ -0,0 +1,170 @@
+//===--  Automemcpy Json Results Analyzer Test ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::ElementsAre;
+using testing::Pair;
+using testing::SizeIs;
+
+namespace llvm {
+namespace automemcpy {
+namespace {
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsOneSample) {
+  static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+  static constexpr DistributionId DistA = {{"A"}};
+  static constexpr SampleId Id = {Foo1, DistA};
+  static constexpr Sample kSamples[] = {
+      Sample{Id, 4},
+  };
+
+  const std::vector<FunctionData> Data = getThroughputs(kSamples);
+  EXPECT_THAT(Data, SizeIs(1));
+  EXPECT_THAT(Data[0].Id, Foo1);
+  EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1));
+  // With a single sample, the median is that sample's value.
+  EXPECT_THAT(
+      Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 4);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsManySamplesSameBucket) {
+  static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+  static constexpr DistributionId DistA = {{"A"}};
+  static constexpr SampleId Id = {Foo1, DistA};
+  static constexpr Sample kSamples[] = {Sample{Id, 4}, Sample{Id, 5},
+                                        Sample{Id, 5}};
+
+  const std::vector<FunctionData> Data = getThroughputs(kSamples);
+  EXPECT_THAT(Data, SizeIs(1));
+  EXPECT_THAT(Data[0].Id, Foo1);
+  EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1));
+  // Samples sharing a function and a distribution collapse into one entry that
+  // holds their median (here the median of 4, 5 and 5 is 5).
+  EXPECT_THAT(
+      Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 5);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsServeralFunctionAndDist) {
+  static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+  static constexpr DistributionId DistA = {{"A"}};
+  static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+  static constexpr DistributionId DistB = {{"B"}};
+  static constexpr Sample kSamples[] = {
+      Sample{{Foo1, DistA}, 1}, Sample{{Foo1, DistB}, 2},
+      Sample{{Foo2, DistA}, 3}, Sample{{Foo2, DistB}, 4}};
+  // Samples are grouped by function: one FunctionData per function id.
+  const std::vector<FunctionData> Data = getThroughputs(kSamples);
+  EXPECT_THAT(Data, SizeIs(2)); // 2 functions Foo1 and Foo2.
+  // Each function has data for both distributions DistA and DistB.
+  EXPECT_THAT(Data[0].PerDistributionData, SizeIs(2));
+  EXPECT_THAT(Data[1].PerDistributionData, SizeIs(2));
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getScore) {
+  static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+  static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+  static constexpr FunctionId Foo3 = {"memcpy3", FunctionType::MEMCPY};
+  static constexpr DistributionId Dist = {{"A"}};
+  static constexpr Sample kSamples[] = {Sample{{Foo1, Dist}, 1},
+                                        Sample{{Foo2, Dist}, 2},
+                                        Sample{{Foo3, Dist}, 3}};
+
+  // Data is aggregated per function.
+  std::vector<FunctionData> Data = getThroughputs(kSamples);
+
+  // Sort Data by function id so the indices used below are deterministic.
+  std::sort(
+      Data.begin(), Data.end(),
+      [](const FunctionData &A, const FunctionData &B) { return A.Id < B.Id; });
+
+  EXPECT_THAT(Data[0].Id, Foo1);
+  EXPECT_THAT(Data[0].PerDistributionData.lookup("A").MedianBytesPerSecond, 1);
+  EXPECT_THAT(Data[1].Id, Foo2);
+  EXPECT_THAT(Data[1].PerDistributionData.lookup("A").MedianBytesPerSecond, 2);
+  EXPECT_THAT(Data[2].Id, Foo3);
+  EXPECT_THAT(Data[2].PerDistributionData.lookup("A").MedianBytesPerSecond, 3);
+
+  // Normalizes throughput per distribution: the lowest maps to 0, highest to 1.
+  fillScores(Data);
+  EXPECT_THAT(Data[0].PerDistributionData.lookup("A").Score, 0);
+  EXPECT_THAT(Data[1].PerDistributionData.lookup("A").Score, 0.5);
+  EXPECT_THAT(Data[2].PerDistributionData.lookup("A").Score, 1);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, castVotes) {
+  static constexpr double kAbsErr = 0.01;
+
+  static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+  static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+  static constexpr FunctionId Foo3 = {"memcpy3", FunctionType::MEMCPY};
+  static constexpr DistributionId DistA = {{"A"}};
+  static constexpr DistributionId DistB = {{"B"}};
+  static constexpr Sample kSamples[] = {
+      Sample{{Foo1, DistA}, 0}, Sample{{Foo1, DistB}, 30},
+      Sample{{Foo2, DistA}, 1}, Sample{{Foo2, DistB}, 100},
+      Sample{{Foo3, DistA}, 7}, Sample{{Foo3, DistB}, 100},
+  };
+
+  // DistA throughput ranges from 0 to 7.
+  // DistB throughput ranges from 30 to 100.
+
+  // Data is aggregated per function.
+  std::vector<FunctionData> Data = getThroughputs(kSamples);
+
+  // Sort Data by function id so the indices used below are deterministic.
+  std::sort(
+      Data.begin(), Data.end(),
+      [](const FunctionData &A, const FunctionData &B) { return A.Id < B.Id; });
+
+  // Normalizes throughput per distribution.
+  fillScores(Data);
+
+  // Turns scores into per-distribution grades and one final grade per function.
+  castVotes(Data);
+
+  EXPECT_THAT(Data[0].Id, Foo1);
+  EXPECT_THAT(Data[1].Id, Foo2);
+  EXPECT_THAT(Data[2].Id, Foo3);
+
+  // Distribution A
+  // Throughput is 0, 1 and 7, so normalized scores are 0, 1/7 and 1.
+  EXPECT_NEAR(Data[0].PerDistributionData.lookup("A").Score, 0, kAbsErr);
+  EXPECT_NEAR(Data[1].PerDistributionData.lookup("A").Score, 1. / 7, kAbsErr);
+  EXPECT_NEAR(Data[2].PerDistributionData.lookup("A").Score, 1, kAbsErr);
+  // which are turned into grades BAD, MEDIOCRE and EXCELLENT.
+  EXPECT_THAT(Data[0].PerDistributionData.lookup("A").Grade, Grade::BAD);
+  EXPECT_THAT(Data[1].PerDistributionData.lookup("A").Grade, Grade::MEDIOCRE);
+  EXPECT_THAT(Data[2].PerDistributionData.lookup("A").Grade, Grade::EXCELLENT);
+
+  // Distribution B
+  // Throughput is 30, 100 and 100, so normalized scores are 0, 1 and 1.
+  EXPECT_NEAR(Data[0].PerDistributionData.lookup("B").Score, 0, kAbsErr);
+  EXPECT_NEAR(Data[1].PerDistributionData.lookup("B").Score, 1, kAbsErr);
+  EXPECT_NEAR(Data[2].PerDistributionData.lookup("B").Score, 1, kAbsErr);
+  // which are turned into grades BAD, EXCELLENT and EXCELLENT.
+  EXPECT_THAT(Data[0].PerDistributionData.lookup("B").Grade, Grade::BAD);
+  EXPECT_THAT(Data[1].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT);
+  EXPECT_THAT(Data[2].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT);
+
+  // Now looking from the functions' point of view.
+  // Note the array is indexed by GradeEnum values (EXCELLENT = 0 / BAD = 6).
+  EXPECT_THAT(Data[0].GradeHisto, ElementsAre(0, 0, 0, 0, 0, 0, 2));
+  EXPECT_THAT(Data[1].GradeHisto, ElementsAre(1, 0, 0, 0, 0, 1, 0));
+  EXPECT_THAT(Data[2].GradeHisto, ElementsAre(2, 0, 0, 0, 0, 0, 0));
+
+  EXPECT_THAT(Data[0].FinalGrade, Grade::BAD);
+  EXPECT_THAT(Data[1].FinalGrade, Grade::MEDIOCRE);
+  EXPECT_THAT(Data[2].FinalGrade, Grade::EXCELLENT);
+}
+
+} // namespace
+} // namespace automemcpy
+} // namespace llvm

diff  --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
index 1b893373e41cd..63e30baded12b 100644
--- a/libc/src/string/memory_utils/elements.h
+++ b/libc/src/string/memory_utils/elements.h
@@ -151,6 +151,43 @@ template <> struct Chained<> {
   static void SplatSet(char *dst, const unsigned char value) {}
 };
 
+// Overlap ElementA and ElementB so they span Size bytes: ElementA covers the
+// first ElementA::kSize bytes and ElementB the last ElementB::kSize bytes.
+template <size_t Size, typename ElementA, typename ElementB = ElementA>
+struct Overlap {
+  static constexpr size_t kSize = Size;
+  static_assert(ElementB::kSize <= ElementA::kSize, "ElementB too big");
+  static_assert(ElementA::kSize <= Size, "ElementA too big");
+  static_assert((ElementA::kSize + ElementB::kSize) >= Size,
+                "Elements too small to overlap");
+  // Start of ElementB, chosen so that it ends exactly at byte Size.
+  static constexpr size_t kOffset = kSize - ElementB::kSize;
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+    ElementA::Copy(dst, src);
+    ElementB::Copy(dst + kOffset, src + kOffset);
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    return ElementA::Equals(lhs, rhs) &&
+           ElementB::Equals(lhs + kOffset, rhs + kOffset);
+  }
+
+  // ElementA's bytes come first, so they decide the result when they differ.
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    if (!ElementA::Equals(lhs, rhs))
+      return ElementA::ThreeWayCompare(lhs, rhs);
+    if (!ElementB::Equals(lhs + kOffset, rhs + kOffset))
+      return ElementB::ThreeWayCompare(lhs + kOffset, rhs + kOffset);
+    return 0;
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    ElementA::SplatSet(dst, value);
+    ElementB::SplatSet(dst + kOffset, value);
+  }
+};
+
 // Runtime-size Higher-Order Operations
 // ------------------------------------
 // - Tail<T>: Perform the operation on the last 'T::kSize' bytes of the buffer.