[libc-commits] [libc] 00c943a - [libc] automemcpy
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Thu Oct 28 04:10:39 PDT 2021
Author: Guillaume Chatelet
Date: 2021-10-28T11:10:15Z
New Revision: 00c943a5488577c1237df81fb5d9b3312f429738
URL: https://github.com/llvm/llvm-project/commit/00c943a5488577c1237df81fb5d9b3312f429738
DIFF: https://github.com/llvm/llvm-project/commit/00c943a5488577c1237df81fb5d9b3312f429738.diff
LOG: [libc] automemcpy
Added:
libc/benchmarks/automemcpy/CMakeLists.txt
libc/benchmarks/automemcpy/README.md
libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
libc/benchmarks/automemcpy/lib/CMakeLists.txt
libc/benchmarks/automemcpy/lib/CodeGen.cpp
libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
libc/benchmarks/automemcpy/unittests/CMakeLists.txt
libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp
Modified:
libc/benchmarks/CMakeLists.txt
libc/src/string/memory_utils/elements.h
Removed:
################################################################################
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 18e5fc0c255c9..01aab0585bbf7 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -112,9 +112,14 @@ add_library(libc-memory-benchmark
EXCLUDE_FROM_ALL
LibcMemoryBenchmark.cpp
LibcMemoryBenchmark.h
+ LibcFunctionPrototypes.h
MemorySizeDistributions.cpp
MemorySizeDistributions.h
)
+target_include_directories(libc-memory-benchmark
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
+)
target_link_libraries(libc-memory-benchmark
PUBLIC
libc-benchmark
@@ -196,3 +201,5 @@ target_link_libraries(libc.benchmarks.memory_functions.opt_host
libc.src.string.bzero_opt_host
benchmark_main
)
+
+add_subdirectory(automemcpy)
diff --git a/libc/benchmarks/automemcpy/CMakeLists.txt b/libc/benchmarks/automemcpy/CMakeLists.txt
new file mode 100644
index 0000000000000..ef9b4218c8d61
--- /dev/null
+++ b/libc/benchmarks/automemcpy/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(NOT LIBC_BUILD_AUTOMEMCPY)
+ return ()
+endif()
+
+if(NOT LLVM_WITH_Z3)
+ MESSAGE(FATAL_ERROR "Building llvm-libc automemcpy requires Z3")
+endif()
+
+set(LIBC_AUTOMEMCPY_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+add_subdirectory(lib)
+add_subdirectory(unittests)
diff --git a/libc/benchmarks/automemcpy/README.md b/libc/benchmarks/automemcpy/README.md
new file mode 100644
index 0000000000000..88d0b7ece9b9f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/README.md
@@ -0,0 +1,111 @@
+This folder contains an implementation of [automemcpy: A framework for automatic generation of fundamental memory operations](https://research.google/pubs/pub50338/).
+
+It uses the [Z3 theorem prover](https://github.com/Z3Prover/z3) to enumerate a subset of valid memory function implementations. These implementations are then materialized as C++ code and can be [benchmarked](../) against various [size distributions](../distributions). This process helps design efficient implementations for a particular environment (size distribution, processor, or custom compilation options).
+
+This is not enabled by default, as it is mostly useful when working on tuning the library implementation. To build it, use `LIBC_BUILD_AUTOMEMCPY=ON` (see below).
+
+## Prerequisites
+
+You may need to install `Z3` from source if it's not available on your system.
+Here we show instructions to install it into `<Z3_INSTALL_DIR>`.
+You may need `sudo` to run `make install`.
+
+```shell
+mkdir -p ~/git
+cd ~/git
+git clone https://github.com/Z3Prover/z3.git
+cd z3
+python scripts/mk_make.py --prefix=<Z3_INSTALL_DIR>
+cd build
+make -j
+make install
+```
+
+## Configuration
+
+```shell
+mkdir -p <BUILD_DIR>
+cd <LLVM_PROJECT_DIR>/llvm
+cmake -DCMAKE_C_COMPILER=/usr/bin/clang \
+ -DCMAKE_CXX_COMPILER=/usr/bin/clang++ \
+ -DLLVM_ENABLE_PROJECTS="libc" \
+ -DLLVM_ENABLE_Z3_SOLVER=ON \
+ -DLLVM_Z3_INSTALL_DIR=<Z3_INSTALL_DIR> \
+ -DLIBC_BUILD_AUTOMEMCPY=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -B<BUILD_DIR>
+```
+
+## Targets and compilation
+
+There are three main CMake targets:
+ 1. `automemcpy_implementations`
+     - runs `Z3` and materializes valid memory functions as C++ code; a message will display its on-disk location.
+ - the source code is then compiled using the native host optimizations (i.e. `-march=native` or `-mcpu=native` depending on the architecture).
+ 2. `automemcpy`
+ - the binary that benchmarks the autogenerated implementations.
+ 3. `automemcpy_result_analyzer`
+ - the binary that analyses the benchmark results.
+
+You only need to compile the two binaries, as they both pull in the autogenerated code as a dependency.
+
+```shell
+make -C <BUILD_DIR> -j automemcpy automemcpy_result_analyzer
+```
+
+## Running the benchmarks
+
+Make sure to save the results of the benchmark as a json file.
+
+```shell
+<BUILD_DIR>/bin/automemcpy --benchmark_out_format=json --benchmark_out=<RESULTS_DIR>/results.json
+```
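+
+The analysis below picks the median throughput over several runs, so you may
+want to repeat the benchmark and keep each result file. A minimal sketch (file
+names are illustrative):
+
+```shell
+for i in 1 2 3; do
+  <BUILD_DIR>/bin/automemcpy --benchmark_out_format=json \
+      --benchmark_out=<RESULTS_DIR>/results-$i.json
+done
+```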
+
+### Additional useful options
+
+
+ - `--benchmark_min_time=.2`
+
+   By default, each function is benchmarked for at least one second; here we lower it to 200ms.
+
+ - `--benchmark_filter="BM_Memset|BM_Bzero"`
+
+   By default, all functions are benchmarked; here we restrict them to `memset` and `bzero`.
+
+Other options might be useful; use `--help` for more information.
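+
+For example, a quick run restricted to `memset` and `bzero` could combine the
+options above (paths are placeholders, as in the previous examples):
+
+```shell
+<BUILD_DIR>/bin/automemcpy --benchmark_min_time=.2 \
+    --benchmark_filter="BM_Memset|BM_Bzero" \
+    --benchmark_out_format=json --benchmark_out=<RESULTS_DIR>/results.json
+```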
+
+## Analyzing the benchmarks
+
+Analysis is performed by running `automemcpy_result_analyzer` on one or more json result files.
+
+```shell
+<BUILD_DIR>/bin/automemcpy_result_analyzer <RESULTS_DIR>/results.json
+```
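+
+If you have several result files (e.g. from repeated runs), they can all be
+passed at once; the glob below is illustrative:
+
+```shell
+<BUILD_DIR>/bin/automemcpy_result_analyzer <RESULTS_DIR>/results-*.json
+```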
+
+What it does:
+ 1. Gathers all throughput values for each function / distribution pair and picks the median one.\
+ This allows picking a representative value over many runs of the benchmark. Please make sure all the runs happen under similar circumstances.
+
+ 2. For each distribution, looks at the span of throughputs for functions of the same type (e.g. for distribution `A`, memcpy throughput spans from 2GiB/s to 5GiB/s).
+
+ 3. For each distribution, gives a normalized score to each function (e.g. for distribution `A`, function `M` scores 0.65).\
+    This score is then turned into a grade (`EXCELLENT`, `VERY_GOOD`, `GOOD`, `PASSABLE`, `INADEQUATE`, `MEDIOCRE`, `BAD`) so that each distribution categorizes how functions perform according to it.
+
+ 4. A [Majority Judgement](https://en.wikipedia.org/wiki/Majority_judgment) process is then used to categorize each function. This enables finer analysis of how distributions agree on which function is better. In the following example, `Function_1` and `Function_2` are rated `EXCELLENT`, but looking at the distribution of grades might help decide which is best.
+
+| | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | INADEQUATE | MEDIOCRE | BAD |
+|------------|:---------:|:---------:|:----:|:--------:|:----------:|:--------:|:---:|
+| Function_1 | 7 | 1 | 2 | | | | |
+| Function_2 | 6 | 4 | | | | | |
+
+The tool outputs the histogram of grades for each function. In case of a tie, other dimensions might help decide (e.g. code size, performance on other microarchitectures).
+
+```
+EXCELLENT |█▁▂ | Function_0
+EXCELLENT |█▅ | Function_1
+VERY_GOOD |▂█▁ ▁ | Function_2
+GOOD | ▁█▄ | Function_3
+PASSABLE | ▂▆▄█ | Function_4
+INADEQUATE | ▃▃█▁ | Function_5
+MEDIOCRE | █▆▁| Function_6
+BAD | ▁▁█| Function_7
+```
diff --git a/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
new file mode 100644
index 0000000000000..389e8249f9399
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/CodeGen.h
@@ -0,0 +1,26 @@
+//===-- C++ code generation from NamedFunctionDescriptors -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H
+#define LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/Support/raw_ostream.h>
+#include <vector>
+
+namespace llvm {
+namespace automemcpy {
+
+// This function serializes the array of FunctionDescriptors as a C++ file.
+void Serialize(raw_ostream &Stream, ArrayRef<FunctionDescriptor> FD);
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif // LIBC_BENCHMARKS_AUTOMEMCPY_CODEGEN_H
diff --git a/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
new file mode 100644
index 0000000000000..444d856a7260d
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/FunctionDescriptor.h
@@ -0,0 +1,159 @@
+//===-- Pod structs to describe a memory function----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H
+#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H
+
+#include <climits>
+#include <cstddef>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/Hashing.h>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/StringRef.h>
+#include <tuple>
+
+namespace llvm {
+namespace automemcpy {
+
+// Boilerplate code to be able to sort and hash types.
+#define COMPARABLE_AND_HASHABLE(T, ...) \
+ inline auto asTuple() const { return std::tie(__VA_ARGS__); } \
+ bool operator==(const T &O) const { return asTuple() == O.asTuple(); } \
+ bool operator<(const T &O) const { return asTuple() < O.asTuple(); } \
+ struct Hasher { \
+ std::size_t operator()(const T &K) const { \
+ return llvm::hash_value(K.asTuple()); \
+ } \
+ };
+
+// Represents the maximum value for the size parameter of a memory function.
+// This is an `int` so we can use it as an expression in Z3.
+// It also allows for a more readable and compact representation when storing
+// the SizeSpan in the autogenerated C++ file.
+static constexpr int kMaxSize = INT_MAX;
+
+// This mimics the `Arg` type in libc/src/string/memory_utils/elements.h without
+// having to depend on it.
+enum class AlignArg { _1, _2, ARRAY_SIZE };
+
+// Describes a range of sizes.
+// We use the begin/end representation instead of first/last to allow for empty
+// ranges (i.e. Begin == End).
+struct SizeSpan {
+ size_t Begin = 0;
+ size_t End = 0;
+
+ COMPARABLE_AND_HASHABLE(SizeSpan, Begin, End)
+};
+
+// Describes a contiguous region.
+// In such a region all sizes are handled individually.
+// e.g. with Span = {0, 2};
+// if(size == 0) return Handle<0>();
+// if(size == 1) return Handle<1>();
+struct Contiguous {
+ SizeSpan Span;
+
+ COMPARABLE_AND_HASHABLE(Contiguous, Span)
+};
+
+// This struct represents a range of sizes over which to use an overlapping
+// strategy. An overlapping strategy of size N handles all sizes from N to 2xN.
+// The span may represent several contiguous overlaps.
+// e.g. with Span = {16, 128};
+// if(size >= 16 and size < 32) return Handle<Overlap<16>>();
+// if(size >= 32 and size < 64) return Handle<Overlap<32>>();
+// if(size >= 64 and size < 128) return Handle<Overlap<64>>();
+struct Overlap {
+ SizeSpan Span;
+
+ COMPARABLE_AND_HASHABLE(Overlap, Span)
+};
+
+// Describes a region using a loop handling BlockSize bytes at a time. The
+// remaining bytes of the loop are handled with an overlapping operation.
+struct Loop {
+ SizeSpan Span;
+ size_t BlockSize = 0;
+
+ COMPARABLE_AND_HASHABLE(Loop, Span, BlockSize)
+};
+
+// Same as `Loop` but starts by aligning a buffer on `Alignment` bytes.
+// A first operation handling `Alignment` bytes is performed, followed by a
+// sequence of Loop.BlockSize byte operations. The Loop starts processing from
+// the next aligned byte in the chosen buffer. The remaining bytes of the loop
+// are handled with an overlapping operation.
+struct AlignedLoop {
+ Loop Loop;
+ size_t Alignment = 0; // Size of the alignment.
+ AlignArg AlignTo = AlignArg::_1; // Which buffer to align.
+
+ COMPARABLE_AND_HASHABLE(AlignedLoop, Loop, Alignment, AlignTo)
+};
+
+// Some processors offer special instructions to handle the memory function
+// completely; we refer to such instructions as accelerators.
+struct Accelerator {
+ SizeSpan Span;
+
+ COMPARABLE_AND_HASHABLE(Accelerator, Span)
+};
+
+// The memory functions are assembled out of primitives that can be implemented
+// with regular scalar operations (SCALAR), with the help of vector or bitcount
+// instructions (NATIVE) or by deferring it to the compiler (BUILTIN).
+enum class ElementTypeClass {
+ SCALAR,
+ NATIVE,
+ BUILTIN,
+};
+
+// A simple enum to categorize which function is being implemented.
+enum class FunctionType {
+ MEMCPY,
+ MEMCMP,
+ BCMP,
+ MEMSET,
+ BZERO,
+};
+
+// This struct describes the skeleton of the implementation; it does not go into
+// every detail but is enough to uniquely identify the implementation.
+struct FunctionDescriptor {
+ FunctionType Type;
+ Optional<Contiguous> Contiguous;
+ Optional<Overlap> Overlap;
+ Optional<Loop> Loop;
+ Optional<AlignedLoop> AlignedLoop;
+ Optional<Accelerator> Accelerator;
+ ElementTypeClass ElementClass;
+
+ COMPARABLE_AND_HASHABLE(FunctionDescriptor, Type, Contiguous, Overlap, Loop,
+ AlignedLoop, Accelerator, ElementClass)
+
+ inline size_t id() const { return llvm::hash_value(asTuple()); }
+};
+
+// Same as above but with the function name.
+struct NamedFunctionDescriptor {
+ StringRef Name;
+ FunctionDescriptor Desc;
+};
+
+template <typename T> llvm::hash_code hash_value(const ArrayRef<T> &V) {
+ return llvm::hash_combine_range(V.begin(), V.end());
+}
+template <typename T> llvm::hash_code hash_value(const T &O) {
+ return llvm::hash_value(O.asTuple());
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_COMMON_H */
diff --git a/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
new file mode 100644
index 0000000000000..48e8815801c55
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/RandomFunctionGenerator.h
@@ -0,0 +1,62 @@
+//===-- Generate random but valid function descriptors ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H
+#define LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include <cstddef>
+#include <cstdint>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/StringRef.h>
+#include <vector>
+#include <z3++.h>
+
+namespace llvm {
+namespace automemcpy {
+
+// Holds the state for the constraint solver.
+// It implements a single method that returns the next valid description.
+struct RandomFunctionGenerator {
+ RandomFunctionGenerator();
+
+ // Get the next valid FunctionDescriptor or llvm::None.
+ Optional<FunctionDescriptor> next();
+
+private:
+ // Returns an expression where `Variable` is forced to be one of the `Values`.
+ z3::expr inSetConstraint(z3::expr &Variable, ArrayRef<int> Values) const;
+  // Add constraints to `Begin` and `End` so that they are:
+  // - between 0 and kMaxSize (inclusive)
+  // - ordered (Begin <= End)
+  // - amongst a set of predefined values.
+ void addBoundsAndAnchors(z3::expr &Begin, z3::expr &End);
+  // Add constraints to make sure that the loop block size is amongst a set of
+  // predefined values. Also makes sure that the loop is iterated at least
+  // `LoopMinIter` times.
+ void addLoopConstraints(const z3::expr &LoopBegin, const z3::expr &LoopEnd,
+ z3::expr &LoopBlockSize, int LoopMinIter);
+
+ z3::context Context;
+ z3::solver Solver;
+
+ z3::expr Type;
+ z3::expr ContiguousBegin, ContiguousEnd;
+ z3::expr OverlapBegin, OverlapEnd;
+ z3::expr LoopBegin, LoopEnd, LoopBlockSize;
+ z3::expr AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize,
+ AlignedAlignment, AlignedArg;
+ z3::expr AcceleratorBegin, AcceleratorEnd;
+ z3::expr ElementClass;
+};
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif /* LLVM_LIBC_BENCHMARKS_AUTOMEMCPY_RANDOM_FUNCTION_GENERATOR_H */
diff --git a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
new file mode 100644
index 0000000000000..845c3e1e1180f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h
@@ -0,0 +1,99 @@
+//===-- Analyze benchmark JSON files ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H
+#define LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H
+
+#include "automemcpy/FunctionDescriptor.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include <array>
+#include <vector>
+
+namespace llvm {
+namespace automemcpy {
+
+// A Grade as in the Majority Judgment voting system.
+struct Grade {
+ enum GradeEnum {
+ EXCELLENT,
+ VERY_GOOD,
+ GOOD,
+ PASSABLE,
+ INADEQUATE,
+ MEDIOCRE,
+ BAD,
+ ARRAY_SIZE,
+ };
+
+ // Returns a human readable string of the enum.
+ static StringRef getString(const GradeEnum &GE);
+
+ // Turns 'Score' into a GradeEnum.
+ static GradeEnum judge(double Score);
+};
+
+// A 'GradeEnum' indexed array with counts for each grade.
+using GradeHistogram = std::array<size_t, Grade::ARRAY_SIZE>;
+
+// Identifies a Function by its name and type. Used as a key in a map.
+struct FunctionId {
+ StringRef Name;
+ FunctionType Type;
+ COMPARABLE_AND_HASHABLE(FunctionId, Type, Name)
+};
+
+struct PerDistributionData {
+ double MedianBytesPerSecond; // Median of samples for this distribution.
+ double Score; // Normalized score for this distribution.
+ Grade::GradeEnum Grade; // Grade for this distribution.
+};
+
+struct FunctionData {
+ FunctionId Id;
+ StringMap<PerDistributionData> PerDistributionData;
+ GradeHistogram GradeHisto = {}; // GradeEnum indexed array
+ Grade::GradeEnum FinalGrade = Grade::BAD; // Overall grade for this function
+};
+
+// Identifies a Distribution by its name. Used as a key in a map.
+struct DistributionId {
+ StringRef Name;
+ COMPARABLE_AND_HASHABLE(DistributionId, Name)
+};
+
+// Identifies a Sample by its distribution and function. Used as a key in a map.
+struct SampleId {
+ FunctionId Function;
+ DistributionId Distribution;
+ COMPARABLE_AND_HASHABLE(SampleId, Function.Type, Function.Name,
+ Distribution.Name)
+};
+
+// A SampleId with an associated measured throughput.
+struct Sample {
+ SampleId Id;
+ double BytesPerSecond = 0;
+};
+
+// This function groups Samples that belong to the same function and
+// distribution and retains the median throughput. It then stores each result
+// into a 'FunctionData' and returns them as a vector.
+std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples);
+
+// Normalize the function's throughput per distribution.
+void fillScores(MutableArrayRef<FunctionData> Functions);
+
+// Converts scores into Grades, stores a histogram of Grades for each function,
+// and casts a median grade for each function.
+void castVotes(MutableArrayRef<FunctionData> Functions);
+
+} // namespace automemcpy
+} // namespace llvm
+
+#endif // LIBC_BENCHMARKS_AUTOMEMCPY_RESULTANALYZER_H
diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..073a92ae68ad0
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_library(automemcpy_codegen CodeGen.cpp)
+target_link_libraries(automemcpy_codegen PUBLIC LLVMSupport)
+target_compile_options(automemcpy_codegen PUBLIC -fno-rtti)
+target_include_directories(automemcpy_codegen PUBLIC ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+
+add_executable(automemcpy_codegen_main CodeGenMain.cpp RandomFunctionGenerator.cpp)
+target_link_libraries(automemcpy_codegen_main PUBLIC automemcpy_codegen ${Z3_LIBRARIES})
+target_compile_options(automemcpy_codegen_main PUBLIC -fno-rtti)
+
+set(Implementations "${CMAKE_CURRENT_BINARY_DIR}/Implementations.cpp")
+add_custom_command(
+ OUTPUT ${Implementations}
+ COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/automemcpy_codegen_main" > "${Implementations}"
+ COMMAND echo "automemcpy implementations generated in ${Implementations}"
+ WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+ DEPENDS automemcpy_codegen_main
+)
+
+add_library(automemcpy_implementations "${Implementations}")
+target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark)
+target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+target_compile_options(automemcpy_implementations PUBLIC -fno-rtti PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin)
+
+add_executable(automemcpy EXCLUDE_FROM_ALL ${LIBC_SOURCE_DIR}/benchmarks/LibcMemoryGoogleBenchmarkMain.cpp)
+target_link_libraries(automemcpy PRIVATE libc-memory-benchmark benchmark_main automemcpy_implementations)
+
+add_library(automemcpy_result_analyzer_lib EXCLUDE_FROM_ALL ResultAnalyzer.cpp)
+target_link_libraries(automemcpy_result_analyzer_lib PUBLIC LLVMSupport)
+target_include_directories(automemcpy_result_analyzer_lib PUBLIC ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+
+add_executable(automemcpy_result_analyzer EXCLUDE_FROM_ALL ResultAnalyzerMain.cpp)
+target_link_libraries(automemcpy_result_analyzer PRIVATE automemcpy_result_analyzer_lib automemcpy_implementations)
diff --git a/libc/benchmarks/automemcpy/lib/CodeGen.cpp b/libc/benchmarks/automemcpy/lib/CodeGen.cpp
new file mode 100644
index 0000000000000..28bd62044c549
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CodeGen.cpp
@@ -0,0 +1,646 @@
+//===-- C++ code generation from NamedFunctionDescriptors -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This code is responsible for generating the "Implementations.cpp" file.
+// The file is composed like this:
+//
+// 1. Includes
+// 2. Using statements to help readability.
+// 3. Source code for all the mem function implementations.
+// 4. The function to retrieve all the function descriptors with their name.
+// llvm::ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors();
+// 5. The functions for the benchmarking infrastructure:
+// llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations();
+// llvm::ArrayRef<MemcmpOrBcmpConfiguration> getMemcmpConfigurations();
+// llvm::ArrayRef<MemcmpOrBcmpConfiguration> getBcmpConfigurations();
+// llvm::ArrayRef<MemsetConfiguration> getMemsetConfigurations();
+// llvm::ArrayRef<BzeroConfiguration> getBzeroConfigurations();
+//
+//
+// Sections 3, 4 and 5 are handled by the following namespaces:
+// - codegen::functions
+// - codegen::descriptors
+// - codegen::configurations
+//
+// The programming style is functional. In each of these namespaces, the
+// original `NamedFunctionDescriptor` object is turned into a different type. We
+// make use of overloaded stream operators to format the resulting type into
+// either a function, a descriptor or a configuration. The entry point of each
+// namespace is the Serialize function.
+//
+// Note the code here is better understood by starting from the `Serialize`
+// function at the end of the file.
+
+#include "automemcpy/CodeGen.h"
+#include <cassert>
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/StringSet.h>
+#include <llvm/Support/FormatVariadic.h>
+#include <llvm/Support/raw_ostream.h>
+#include <set>
+
+namespace llvm {
+namespace automemcpy {
+namespace codegen {
+
+// The indentation string.
+static constexpr StringRef kIndent = " ";
+
+// The codegen namespace handles the serialization of a NamedFunctionDescriptor
+// into source code for the function, the descriptor and the configuration.
+
+namespace functions {
+
+// This namespace turns a NamedFunctionDescriptor into an actual implementation.
+// -----------------------------------------------------------------------------
+// e.g.
+// static void memcpy_0xB20D4702493C397E(char *__restrict dst,
+// const char *__restrict src,
+// size_t size) {
+// using namespace __llvm_libc::x86;
+// if(size == 0) return;
+// if(size == 1) return Copy<_1>(dst, src);
+// if(size < 4) return Copy<HeadTail<_2>>(dst, src, size);
+// if(size < 8) return Copy<HeadTail<_4>>(dst, src, size);
+// if(size < 16) return Copy<HeadTail<_8>>(dst, src, size);
+// if(size < 32) return Copy<HeadTail<_16>>(dst, src, size);
+// return Copy<Accelerator>(dst, src, size);
+// }
+
+// The `Serialize` method turns a `NamedFunctionDescriptor` into a
+// `FunctionImplementation` which holds all the information needed to produce
+// the C++ source code.
+
+// An Element with its size (e.g. `_16` in the example above).
+struct ElementType {
+ size_t Size;
+};
+// The case `if(size == 0)` is encoded as the Zero type.
+struct Zero {
+ StringRef DefaultReturnValue;
+};
+// An individual size `if(size == X)` is encoded as an Individual type.
+struct Individual {
+ size_t IfEq;
+ ElementType Element;
+};
+// An overlap strategy is encoded as an Overlap type.
+struct Overlap {
+ size_t IfLt;
+ ElementType Element;
+};
+// A loop strategy is encoded as a Loop type.
+struct Loop {
+ size_t IfLt;
+ ElementType Element;
+};
+// An aligned loop strategy is encoded as an AlignedLoop type.
+struct AlignedLoop {
+ size_t IfLt;
+ ElementType Element;
+ ElementType Alignment;
+ StringRef AlignTo;
+};
+// The accelerator strategy.
+struct Accelerator {
+ size_t IfLt;
+};
+// The Context stores data about the function type.
+struct Context {
+ StringRef FunctionReturnType; // e.g. void* or int
+ StringRef FunctionArgs;
+ StringRef ElementOp; // Copy, ThreeWayCompare, SplatSet, ...
+ StringRef FixedSizeArgs;
+ StringRef RuntimeSizeArgs;
+ StringRef AlignArg1;
+ StringRef AlignArg2;
+ StringRef DefaultReturnValue;
+};
+// A detailed representation of the function implementation mapped from the
+// NamedFunctionDescriptor.
+struct FunctionImplementation {
+ Context Ctx;
+ StringRef Name;
+ std::vector<Individual> Individuals;
+ std::vector<Overlap> Overlaps;
+ Optional<Loop> Loop;
+ Optional<AlignedLoop> AlignedLoop;
+ Optional<Accelerator> Accelerator;
+ ElementTypeClass ElementClass;
+};
+
+// Returns the Context for each FunctionType.
+static Context getCtx(FunctionType FT) {
+ switch (FT) {
+ case FunctionType::MEMCPY:
+ return {"void",
+ "(char *__restrict dst, const char *__restrict src, size_t size)",
+ "Copy",
+ "(dst, src)",
+ "(dst, src, size)",
+ "Arg::Dst",
+ "Arg::Src",
+ ""};
+ case FunctionType::MEMCMP:
+ return {"int",
+ "(const char * lhs, const char * rhs, size_t size)",
+ "ThreeWayCompare",
+ "(lhs, rhs)",
+ "(lhs, rhs, size)",
+ "Arg::Lhs",
+ "Arg::Rhs",
+ "0"};
+ case FunctionType::MEMSET:
+ return {"void",
+ "(char * dst, int value, size_t size)",
+ "SplatSet",
+ "(dst, value)",
+ "(dst, value, size)",
+ "Arg::Dst",
+ "Arg::Src",
+ ""};
+ case FunctionType::BZERO:
+ return {"void", "(char * dst, size_t size)",
+ "SplatSet", "(dst, 0)",
+ "(dst, 0, size)", "Arg::Dst",
+ "Arg::Src", ""};
+ default:
+ report_fatal_error("Not yet implemented");
+ }
+}
+
+static StringRef getAligntoString(const Context &Ctx, const AlignArg &AlignTo) {
+ switch (AlignTo) {
+ case AlignArg::_1:
+ return Ctx.AlignArg1;
+ case AlignArg::_2:
+ return Ctx.AlignArg2;
+ case AlignArg::ARRAY_SIZE:
+ report_fatal_error("logic error");
+ }
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream, const ElementType &E) {
+ return Stream << '_' << E.Size;
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Individual &O) {
+ return Stream << O.Element;
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) {
+ return Stream << "HeadTail<" << O.Element << '>';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) {
+ return Stream << "Loop<" << O.Element << '>';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) {
+ return Stream << "Align<" << O.Alignment << ',' << O.AlignTo << ">::Then<"
+ << Loop{O.IfLt, O.Element} << ">";
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) {
+ return Stream << "Accelerator";
+}
+
+template <typename T> struct IfEq {
+ StringRef Op;
+ StringRef Args;
+  const T &Element;
+};
+
+template <typename T> struct IfLt {
+ StringRef Op;
+ StringRef Args;
+  const T &Element;
+};
+
+static raw_ostream &operator<<(raw_ostream &Stream, const Zero &O) {
+ Stream << kIndent << "if(size == 0) return";
+ if (!O.DefaultReturnValue.empty())
+ Stream << ' ' << O.DefaultReturnValue;
+ return Stream << ";\n";
+}
+
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream, const IfEq<T> &O) {
+ return Stream << kIndent << "if(size == " << O.Element.IfEq << ") return "
+ << O.Op << '<' << O.Element << '>' << O.Args << ";\n";
+}
+
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream, const IfLt<T> &O) {
+ Stream << kIndent;
+ if (O.Element.IfLt != kMaxSize)
+ Stream << "if(size < " << O.Element.IfLt << ") ";
+ return Stream << "return " << O.Op << '<' << O.Element << '>' << O.Args
+ << ";\n";
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const ElementTypeClass &Class) {
+ switch (Class) {
+ case ElementTypeClass::SCALAR:
+ return Stream << "scalar";
+ case ElementTypeClass::BUILTIN:
+ return Stream << "builtin";
+ case ElementTypeClass::NATIVE:
+    // FIXME: the framework should provide a `native` namespace that redirects
+    // to x86, arm or other architectures.
+ return Stream << "x86";
+ }
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const FunctionImplementation &FI) {
+ const auto &Ctx = FI.Ctx;
+ Stream << "static " << Ctx.FunctionReturnType << ' ' << FI.Name
+ << Ctx.FunctionArgs << " {\n";
+ Stream << kIndent << "using namespace __llvm_libc::" << FI.ElementClass
+ << ";\n";
+ for (const auto &I : FI.Individuals)
+ if (I.Element.Size == 0)
+ Stream << Zero{Ctx.DefaultReturnValue};
+ else
+ Stream << IfEq<Individual>{Ctx.ElementOp, Ctx.FixedSizeArgs, I};
+ for (const auto &O : FI.Overlaps)
+ Stream << IfLt<Overlap>{Ctx.ElementOp, Ctx.RuntimeSizeArgs, O};
+ if (const auto &C = FI.Loop)
+ Stream << IfLt<Loop>{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C};
+ if (const auto &C = FI.AlignedLoop)
+ Stream << IfLt<AlignedLoop>{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C};
+ if (const auto &C = FI.Accelerator)
+ Stream << IfLt<Accelerator>{Ctx.ElementOp, Ctx.RuntimeSizeArgs, *C};
+ return Stream << "}\n";
+}
+
+// Turns a `NamedFunctionDescriptor` into a `FunctionImplementation` unfolding
+// the contiguous and overlap regions into several statements. The zero case is
+// also mapped to its own type.
+static FunctionImplementation
+getImplementation(const NamedFunctionDescriptor &NamedFD) {
+ const FunctionDescriptor &FD = NamedFD.Desc;
+ FunctionImplementation Impl;
+ Impl.Ctx = getCtx(FD.Type);
+ Impl.Name = NamedFD.Name;
+ Impl.ElementClass = FD.ElementClass;
+ if (auto C = FD.Contiguous)
+ for (size_t I = C->Span.Begin; I < C->Span.End; ++I)
+ Impl.Individuals.push_back(Individual{I, ElementType{I}});
+ if (auto C = FD.Overlap)
+ for (size_t I = C->Span.Begin; I < C->Span.End; I *= 2)
+ Impl.Overlaps.push_back(Overlap{2 * I, ElementType{I}});
+ if (const auto &L = FD.Loop)
+ Impl.Loop = Loop{L->Span.End, ElementType{L->BlockSize}};
+ if (const auto &AL = FD.AlignedLoop)
+ Impl.AlignedLoop = AlignedLoop{
+ AL->Loop.Span.End, ElementType{AL->Loop.BlockSize},
+ ElementType{AL->Alignment}, getAligntoString(Impl.Ctx, AL->AlignTo)};
+ if (const auto &A = FD.Accelerator)
+ Impl.Accelerator = Accelerator{A->Span.End};
+ return Impl;
+}
+
+static void Serialize(raw_ostream &Stream,
+ ArrayRef<NamedFunctionDescriptor> Descriptors) {
+
+ for (const auto &FD : Descriptors)
+ Stream << getImplementation(FD);
+}
+
+} // namespace functions
+
+namespace descriptors {
+
+// This namespace generates the getFunctionDescriptors function:
+// -------------------------------------------------------------
+// e.g.
+// ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+// static constexpr NamedFunctionDescriptor kDescriptors[] = {
+// {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}},
+// {"memcpy_0x8661D80472487AB5",{FunctionType::MEMCPY,Contiguous{{0,1}},llvm::None,llvm::None,llvm::None,Accelerator{{1,kMaxSize}},ElementTypeClass::NATIVE}},
+// ...
+// };
+// return makeArrayRef(kDescriptors);
+// }
+
+static raw_ostream &operator<<(raw_ostream &Stream, const SizeSpan &SS) {
+ Stream << "{" << SS.Begin << ',';
+ if (SS.End == kMaxSize)
+ Stream << "kMaxSize";
+ else
+ Stream << SS.End;
+ return Stream << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Contiguous &O) {
+ return Stream << "Contiguous{" << O.Span << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Overlap &O) {
+ return Stream << "Overlap{" << O.Span << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Loop &O) {
+ return Stream << "Loop{" << O.Span << ',' << O.BlockSize << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignArg &O) {
+ switch (O) {
+ case AlignArg::_1:
+ return Stream << "AlignArg::_1";
+ case AlignArg::_2:
+ return Stream << "AlignArg::_2";
+ case AlignArg::ARRAY_SIZE:
+ report_fatal_error("logic error");
+ }
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const AlignedLoop &O) {
+ return Stream << "AlignedLoop{" << O.Loop << ',' << O.Alignment << ','
+ << O.AlignTo << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const Accelerator &O) {
+ return Stream << "Accelerator{" << O.Span << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const ElementTypeClass &O) {
+ switch (O) {
+ case ElementTypeClass::SCALAR:
+ return Stream << "ElementTypeClass::SCALAR";
+ case ElementTypeClass::BUILTIN:
+ return Stream << "ElementTypeClass::BUILTIN";
+ case ElementTypeClass::NATIVE:
+ return Stream << "ElementTypeClass::NATIVE";
+ }
+}
+static raw_ostream &operator<<(raw_ostream &Stream, const FunctionType &T) {
+ switch (T) {
+ case FunctionType::MEMCPY:
+ return Stream << "FunctionType::MEMCPY";
+ case FunctionType::MEMCMP:
+ return Stream << "FunctionType::MEMCMP";
+ case FunctionType::BCMP:
+ return Stream << "FunctionType::BCMP";
+ case FunctionType::MEMSET:
+ return Stream << "FunctionType::MEMSET";
+ case FunctionType::BZERO:
+ return Stream << "FunctionType::BZERO";
+ }
+}
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const llvm::Optional<T> &MaybeT) {
+ if (MaybeT)
+ return Stream << *MaybeT;
+ return Stream << "llvm::None";
+}
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const FunctionDescriptor &FD) {
+ return Stream << '{' << FD.Type << ',' << FD.Contiguous << ',' << FD.Overlap
+ << ',' << FD.Loop << ',' << FD.AlignedLoop << ','
+ << FD.Accelerator << ',' << FD.ElementClass << '}';
+}
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const NamedFunctionDescriptor &NFD) {
+ return Stream << '{' << '"' << NFD.Name << '"' << ',' << NFD.Desc << '}';
+}
+template <typename T>
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const std::vector<T> &VectorT) {
+ Stream << '{';
+ bool First = true;
+ for (const auto &Obj : VectorT) {
+ if (!First)
+ Stream << ',';
+ Stream << Obj;
+ First = false;
+ }
+ return Stream << '}';
+}
+
+static void Serialize(raw_ostream &Stream,
+ ArrayRef<NamedFunctionDescriptor> Descriptors) {
+ Stream << R"(ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+ static constexpr NamedFunctionDescriptor kDescriptors[] = {
+)";
+ for (size_t I = 0, E = Descriptors.size(); I < E; ++I) {
+ Stream << kIndent << kIndent << Descriptors[I] << ",\n";
+ }
+ Stream << R"( };
+ return makeArrayRef(kDescriptors);
+}
+)";
+}
+
+} // namespace descriptors
+
+namespace configurations {
+
+// This namespace generates the getXXXConfigurations functions:
+// ------------------------------------------------------------
+// e.g.
+// llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations() {
+// using namespace __llvm_libc;
+// static constexpr MemcpyConfiguration kConfigurations[] = {
+// {Wrap<memcpy_0xE00E29EE73994E2B>, "memcpy_0xE00E29EE73994E2B"},
+// {Wrap<memcpy_0x8661D80472487AB5>, "memcpy_0x8661D80472487AB5"},
+// ...
+// };
+// return llvm::makeArrayRef(kConfigurations);
+// }
+
+// The `Wrap` template function is provided in the `Main` function below.
+// It is used to adapt the generated code to the prototype of the C function.
+// For instance, the generated code for a `memcpy` takes `char*` pointers and
+// returns nothing, but the original C `memcpy` function takes and returns
+// `void*` pointers.
+
+struct FunctionName {
+ FunctionType ForType;
+};
+
+struct ReturnType {
+ FunctionType ForType;
+};
+
+struct Configuration {
+ FunctionName Name;
+ ReturnType Type;
+ std::vector<const NamedFunctionDescriptor *> Descriptors;
+};
+
+static raw_ostream &operator<<(raw_ostream &Stream, const FunctionName &FN) {
+ switch (FN.ForType) {
+ case FunctionType::MEMCPY:
+ return Stream << "getMemcpyConfigurations";
+ case FunctionType::MEMCMP:
+ return Stream << "getMemcmpConfigurations";
+ case FunctionType::BCMP:
+ return Stream << "getBcmpConfigurations";
+ case FunctionType::MEMSET:
+ return Stream << "getMemsetConfigurations";
+ case FunctionType::BZERO:
+ return Stream << "getBzeroConfigurations";
+ }
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream, const ReturnType &RT) {
+ switch (RT.ForType) {
+ case FunctionType::MEMCPY:
+ return Stream << "MemcpyConfiguration";
+ case FunctionType::MEMCMP:
+ case FunctionType::BCMP:
+ return Stream << "MemcmpOrBcmpConfiguration";
+ case FunctionType::MEMSET:
+ return Stream << "MemsetConfiguration";
+ case FunctionType::BZERO:
+ return Stream << "BzeroConfiguration";
+ }
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream,
+ const NamedFunctionDescriptor *FD) {
+ return Stream << formatv("{Wrap<{0}>, \"{0}\"}", FD->Name);
+}
+
+static raw_ostream &
+operator<<(raw_ostream &Stream,
+ const std::vector<const NamedFunctionDescriptor *> &Descriptors) {
+ for (size_t I = 0, E = Descriptors.size(); I < E; ++I)
+ Stream << kIndent << kIndent << Descriptors[I] << ",\n";
+ return Stream;
+}
+
+static raw_ostream &operator<<(raw_ostream &Stream, const Configuration &C) {
+ Stream << "llvm::ArrayRef<" << C.Type << "> " << C.Name << "() {\n";
+ if (C.Descriptors.empty())
+ Stream << kIndent << "return {};\n";
+ else {
+ Stream << kIndent << "using namespace __llvm_libc;\n";
+ Stream << kIndent << "static constexpr " << C.Type
+ << " kConfigurations[] = {\n";
+ Stream << C.Descriptors;
+ Stream << kIndent << "};\n";
+ Stream << kIndent << "return llvm::makeArrayRef(kConfigurations);\n";
+ }
+ Stream << "}\n";
+ return Stream;
+}
+
+static void Serialize(raw_ostream &Stream, FunctionType FT,
+ ArrayRef<NamedFunctionDescriptor> Descriptors) {
+ Configuration Conf;
+ Conf.Name = {FT};
+ Conf.Type = {FT};
+ for (const auto &FD : Descriptors)
+ if (FD.Desc.Type == FT)
+ Conf.Descriptors.push_back(&FD);
+ Stream << Conf;
+}
+
+} // namespace configurations
+static void Serialize(raw_ostream &Stream,
+ ArrayRef<NamedFunctionDescriptor> Descriptors) {
+ Stream << "// This file is auto-generated by libc/benchmarks/automemcpy.\n";
+ Stream << "// Functions : " << Descriptors.size() << "\n";
+ Stream << "\n";
+ Stream << "#include \"LibcFunctionPrototypes.h\"\n";
+ Stream << "#include \"automemcpy/FunctionDescriptor.h\"\n";
+ Stream << "#include \"src/string/memory_utils/elements.h\"\n";
+ Stream << "\n";
+ Stream << "using llvm::libc_benchmarks::BzeroConfiguration;\n";
+ Stream << "using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration;\n";
+ Stream << "using llvm::libc_benchmarks::MemcpyConfiguration;\n";
+ Stream << "using llvm::libc_benchmarks::MemsetConfiguration;\n";
+ Stream << "\n";
+ Stream << "namespace __llvm_libc {\n";
+ Stream << "\n";
+ codegen::functions::Serialize(Stream, Descriptors);
+ Stream << "\n";
+ Stream << "} // namespace __llvm_libc\n";
+ Stream << "\n";
+ Stream << "namespace llvm {\n";
+ Stream << "namespace automemcpy {\n";
+ Stream << "\n";
+ codegen::descriptors::Serialize(Stream, Descriptors);
+ Stream << "\n";
+ Stream << "} // namespace automemcpy\n";
+ Stream << "} // namespace llvm\n";
+ Stream << "\n";
+ Stream << R"(
+using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t);
+template <MemcpyStub Foo>
+void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) {
+ Foo(reinterpret_cast<char *__restrict>(dst),
+ reinterpret_cast<const char *__restrict>(src), size);
+ return dst;
+}
+)";
+ codegen::configurations::Serialize(Stream, FunctionType::MEMCPY, Descriptors);
+ Stream << R"(
+using MemcmpStub = int (*)(const char *, const char *, size_t);
+template <MemcmpStub Foo>
+int Wrap(const void *lhs, const void *rhs, size_t size) {
+ return Foo(reinterpret_cast<const char *>(lhs),
+ reinterpret_cast<const char *>(rhs), size);
+}
+)";
+ codegen::configurations::Serialize(Stream, FunctionType::MEMCMP, Descriptors);
+ codegen::configurations::Serialize(Stream, FunctionType::BCMP, Descriptors);
+ Stream << R"(
+using MemsetStub = void (*)(char *, int, size_t);
+template <MemsetStub Foo> void *Wrap(void *dst, int value, size_t size) {
+ Foo(reinterpret_cast<char *>(dst), value, size);
+ return dst;
+}
+)";
+ codegen::configurations::Serialize(Stream, FunctionType::MEMSET, Descriptors);
+ Stream << R"(
+using BzeroStub = void (*)(char *, size_t);
+template <BzeroStub Foo> void Wrap(void *dst, size_t size) {
+ Foo(reinterpret_cast<char *>(dst), size);
+}
+)";
+ codegen::configurations::Serialize(Stream, FunctionType::BZERO, Descriptors);
+ Stream << "// Functions : " << Descriptors.size() << "\n";
+}
+
+} // namespace codegen
+
+// Stores `VolatileStr` into a cache and returns a StringRef of the cached
+// version.
+StringRef getInternalizedString(std::string VolatileStr) {
+ static llvm::StringSet<> StringCache;
+ return StringCache.insert(std::move(VolatileStr)).first->getKey();
+}
+
+static StringRef getString(FunctionType FT) {
+ switch (FT) {
+ case FunctionType::MEMCPY:
+ return "memcpy";
+ case FunctionType::MEMCMP:
+ return "memcmp";
+ case FunctionType::BCMP:
+ return "bcmp";
+ case FunctionType::MEMSET:
+ return "memset";
+ case FunctionType::BZERO:
+ return "bzero";
+ }
+}
+
+void Serialize(raw_ostream &Stream, ArrayRef<FunctionDescriptor> Descriptors) {
+ std::vector<NamedFunctionDescriptor> FunctionDescriptors;
+ FunctionDescriptors.reserve(Descriptors.size());
+ for (auto &FD : Descriptors) {
+ FunctionDescriptors.emplace_back();
+ FunctionDescriptors.back().Name = getInternalizedString(
+ formatv("{0}_{1:X16}", getString(FD.Type), FD.id()));
+ FunctionDescriptors.back().Desc = std::move(FD);
+ }
+ // Sort functions so they are easier to spot in the generated C++ file.
+ std::sort(FunctionDescriptors.begin(), FunctionDescriptors.end(),
+ [](const NamedFunctionDescriptor &A,
+ const NamedFunctionDescriptor &B) { return A.Desc < B.Desc; });
+ codegen::Serialize(Stream, FunctionDescriptors);
+}
+
+} // namespace automemcpy
+} // namespace llvm
diff --git a/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
new file mode 100644
index 0000000000000..618e4f1186e30
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/CodeGenMain.cpp
@@ -0,0 +1,28 @@
+#include "automemcpy/CodeGen.h"
+#include "automemcpy/RandomFunctionGenerator.h"
+#include <unordered_set>
+
+namespace llvm {
+namespace automemcpy {
+
+std::vector<FunctionDescriptor> generateFunctionDescriptors() {
+ std::unordered_set<FunctionDescriptor, FunctionDescriptor::Hasher> Seen;
+ std::vector<FunctionDescriptor> FunctionDescriptors;
+ RandomFunctionGenerator P;
+ while (Optional<FunctionDescriptor> MaybeFD = P.next()) {
+ FunctionDescriptor FD = *MaybeFD;
+    if (Seen.count(FD)) // FIXME: Z3 sometimes returns the same object twice.
+ continue;
+ Seen.insert(FD);
+ FunctionDescriptors.push_back(std::move(FD));
+ }
+ return FunctionDescriptors;
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+int main(int, char **) {
+ llvm::automemcpy::Serialize(llvm::outs(),
+ llvm::automemcpy::generateFunctionDescriptors());
+}
diff --git a/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
new file mode 100644
index 0000000000000..763fbc6d85a3d
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/RandomFunctionGenerator.cpp
@@ -0,0 +1,279 @@
+//===-- Generate random but valid function descriptors -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/RandomFunctionGenerator.h"
+
+#include <llvm/ADT/None.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include <set>
+
+namespace llvm {
+namespace automemcpy {
+
+// Exploration parameters
+// ----------------------
+// Here we define a set of values that will constrain the exploration and
+// limit combinatorial explosion.
+
+// We limit the number of cases for individual sizes to sizes up to 4.
+// More individual sizes don't bring much over the overlapping strategy.
+static constexpr int kMaxIndividualSize = 4;
+
+// We limit Overlapping Strategy to sizes up to 256.
+// An overlap of 256B means accessing 128B at once, which is usually not
+// feasible on current CPUs. We rely on the compiler to generate multiple
+// loads/stores if needed, but higher sizes are unlikely to benefit from hardware
+// acceleration.
+static constexpr int kMaxOverlapSize = 256;
+
+// For the loop strategies, we make sure that they iterate at least a certain
+// number of times to amortize the cost of looping.
+static constexpr int kLoopMinIter = 3;
+static constexpr int kAlignedLoopMinIter = 2;
+
+// We restrict the size of the block of data to handle in a loop.
+// Generally speaking, block sizes <= 16 perform poorly.
+static constexpr int kLoopBlockSize[] = {16, 32, 64};
+
+// We restrict alignment to the following values.
+static constexpr int kLoopAlignments[] = {16, 32, 64};
+
+// We make sure that the region bounds are one of the following values.
+static constexpr int kAnchors[] = {0, 1, 2, 4, 8, 16, 32, 48,
+ 64, 96, 128, 256, 512, 1024, kMaxSize};
+
+// We also allow disabling loops, aligned loops and accelerators.
+static constexpr bool kDisableLoop = false;
+static constexpr bool kDisableAlignedLoop = false;
+static constexpr bool kDisableAccelerator = false;
+
+// For memcpy, we can also explore whether aligning on source or destination has
+// an effect.
+static constexpr bool kExploreAlignmentArg = true;
+
+// The functions we generate code for.
+// BCMP is specifically disabled for now.
+static constexpr int kFunctionTypes[] = {
+ (int)FunctionType::MEMCPY,
+ (int)FunctionType::MEMCMP,
+ // (int)FunctionType::BCMP,
+ (int)FunctionType::MEMSET,
+ (int)FunctionType::BZERO,
+};
+
+// The actual implementation of each function can be handled via primitive types
+// (SCALAR), vector types where available (NATIVE), or by the compiler (BUILTIN).
+// We want to move toward delegating the code generation entirely to the
+// compiler, but for now we have to make use of per-microarchitecture custom
+// implementations. Scalar is more portable but also less performant, so we
+// disable it as well.
+static constexpr int kElementClasses[] = {
+ // (int)ElementTypeClass::SCALAR,
+ (int)ElementTypeClass::NATIVE,
+ // (int)ElementTypeClass::BUILTIN
+};
+
+RandomFunctionGenerator::RandomFunctionGenerator()
+ : Solver(Context), Type(Context.int_const("Type")),
+ ContiguousBegin(Context.int_const("ContiguousBegin")),
+ ContiguousEnd(Context.int_const("ContiguousEnd")),
+ OverlapBegin(Context.int_const("OverlapBegin")),
+ OverlapEnd(Context.int_const("OverlapEnd")),
+ LoopBegin(Context.int_const("LoopBegin")),
+ LoopEnd(Context.int_const("LoopEnd")),
+ LoopBlockSize(Context.int_const("LoopBlockSize")),
+ AlignedLoopBegin(Context.int_const("AlignedLoopBegin")),
+ AlignedLoopEnd(Context.int_const("AlignedLoopEnd")),
+ AlignedLoopBlockSize(Context.int_const("AlignedLoopBlockSize")),
+ AlignedAlignment(Context.int_const("AlignedAlignment")),
+ AlignedArg(Context.int_const("AlignedArg")),
+ AcceleratorBegin(Context.int_const("AcceleratorBegin")),
+ AcceleratorEnd(Context.int_const("AcceleratorEnd")),
+ ElementClass(Context.int_const("ElementClass")) {
+ // All possible functions.
+ Solver.add(inSetConstraint(Type, kFunctionTypes));
+
+ // Add constraints for region bounds.
+ addBoundsAndAnchors(ContiguousBegin, ContiguousEnd);
+ addBoundsAndAnchors(OverlapBegin, OverlapEnd);
+ addBoundsAndAnchors(LoopBegin, LoopEnd);
+ addBoundsAndAnchors(AlignedLoopBegin, AlignedLoopEnd);
+ addBoundsAndAnchors(AcceleratorBegin, AcceleratorEnd);
+ // We always consider strategies in this order, and we
+ // always end with the `Accelerator` strategy, as it's typically more
+ // efficient for large sizes.
+ // Contiguous <= Overlap <= Loop <= AlignedLoop <= Accelerator
+ Solver.add(ContiguousEnd == OverlapBegin);
+ Solver.add(OverlapEnd == LoopBegin);
+ Solver.add(LoopEnd == AlignedLoopBegin);
+ Solver.add(AlignedLoopEnd == AcceleratorBegin);
+ // Fix endpoints: The minimum size that we want to copy is 0, and we always
+ // start with the `Contiguous` strategy. The max size is `kMaxSize`.
+ Solver.add(ContiguousBegin == 0);
+ Solver.add(AcceleratorEnd == kMaxSize);
+ // Contiguous
+ Solver.add(ContiguousEnd <= kMaxIndividualSize + 1);
+ // Overlap
+ Solver.add(OverlapEnd <= kMaxOverlapSize + 1);
+ // Overlap only ever makes sense when accessing multiple bytes at a time.
+ // i.e. Overlap<1> is useless.
+ Solver.add(OverlapBegin == OverlapEnd || OverlapBegin >= 2);
+ // Loop
+ addLoopConstraints(LoopBegin, LoopEnd, LoopBlockSize, kLoopMinIter);
+ // Aligned Loop
+ addLoopConstraints(AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize,
+ kAlignedLoopMinIter);
+ Solver.add(inSetConstraint(AlignedAlignment, kLoopAlignments));
+ Solver.add(AlignedLoopBegin == AlignedLoopEnd || AlignedLoopBegin >= 64);
+ Solver.add(AlignedLoopBlockSize >= AlignedAlignment);
+ Solver.add(AlignedLoopBlockSize >= LoopBlockSize);
+ z3::expr IsMemcpy = Type == (int)FunctionType::MEMCPY;
+ z3::expr ExploreAlignment = IsMemcpy && kExploreAlignmentArg;
+ Solver.add(
+ (ExploreAlignment &&
+ inSetConstraint(AlignedArg, {(int)AlignArg::_1, (int)AlignArg::_2})) ||
+ (!ExploreAlignment && AlignedArg == (int)AlignArg::_1));
+ // Accelerator
+ Solver.add(IsMemcpy ||
+ (AcceleratorBegin ==
+ AcceleratorEnd)); // Only Memcpy has accelerator for now.
+ // Element classes
+ Solver.add(inSetConstraint(ElementClass, kElementClasses));
+
+ if (kDisableLoop)
+ Solver.add(LoopBegin == LoopEnd);
+ if (kDisableAlignedLoop)
+ Solver.add(AlignedLoopBegin == AlignedLoopEnd);
+ if (kDisableAccelerator)
+ Solver.add(AcceleratorBegin == AcceleratorEnd);
+}
+
+// Creates SizeSpan from Begin/End values.
+// Returns llvm::None if Begin==End.
+static Optional<SizeSpan> AsSizeSpan(size_t Begin, size_t End) {
+ if (Begin == End)
+ return None;
+ SizeSpan SS;
+ SS.Begin = Begin;
+ SS.End = End;
+ return SS;
+}
+
+// Generic method to create a `Region` struct with a Span or None if span is
+// empty.
+template <typename Region>
+static Optional<Region> As(size_t Begin, size_t End) {
+ if (auto Span = AsSizeSpan(Begin, End)) {
+ Region Output;
+ Output.Span = *Span;
+ return Output;
+ }
+ return None;
+}
+
+// Returns a Loop struct or None if span is empty.
+static Optional<Loop> AsLoop(size_t Begin, size_t End, size_t BlockSize) {
+ if (auto Span = AsSizeSpan(Begin, End)) {
+ Loop Output;
+ Output.Span = *Span;
+ Output.BlockSize = BlockSize;
+ return Output;
+ }
+ return None;
+}
+
+// Returns an AlignedLoop struct or None if span is empty.
+static Optional<AlignedLoop> AsAlignedLoop(size_t Begin, size_t End,
+ size_t BlockSize, size_t Alignment,
+ AlignArg AlignTo) {
+ if (auto Loop = AsLoop(Begin, End, BlockSize)) {
+ AlignedLoop Output;
+ Output.Loop = *Loop;
+ Output.Alignment = Alignment;
+ Output.AlignTo = AlignTo;
+ return Output;
+ }
+ return None;
+}
+
+Optional<FunctionDescriptor> RandomFunctionGenerator::next() {
+ if (Solver.check() != z3::sat)
+ return {};
+
+ z3::model m = Solver.get_model();
+
+ // Helper method to get the current numerical value of a z3::expr.
+ const auto E = [&m](z3::expr &V) -> int {
+ return m.eval(V).get_numeral_int();
+ };
+
+  // Fill in the function descriptor to return.
+ FunctionDescriptor R;
+ R.Type = FunctionType(E(Type));
+ R.Contiguous = As<Contiguous>(E(ContiguousBegin), E(ContiguousEnd));
+ R.Overlap = As<Overlap>(E(OverlapBegin), E(OverlapEnd));
+ R.Loop = AsLoop(E(LoopBegin), E(LoopEnd), E(LoopBlockSize));
+ R.AlignedLoop = AsAlignedLoop(E(AlignedLoopBegin), E(AlignedLoopEnd),
+ E(AlignedLoopBlockSize), E(AlignedAlignment),
+ AlignArg(E(AlignedArg)));
+ R.Accelerator = As<Accelerator>(E(AcceleratorBegin), E(AcceleratorEnd));
+ R.ElementClass = ElementTypeClass(E(ElementClass));
+
+ // Express current state as a set of constraints.
+ z3::expr CurrentLayout =
+ (Type == E(Type)) && (ContiguousBegin == E(ContiguousBegin)) &&
+ (ContiguousEnd == E(ContiguousEnd)) &&
+ (OverlapBegin == E(OverlapBegin)) && (OverlapEnd == E(OverlapEnd)) &&
+ (LoopBegin == E(LoopBegin)) && (LoopEnd == E(LoopEnd)) &&
+ (LoopBlockSize == E(LoopBlockSize)) &&
+ (AlignedLoopBegin == E(AlignedLoopBegin)) &&
+ (AlignedLoopEnd == E(AlignedLoopEnd)) &&
+ (AlignedLoopBlockSize == E(AlignedLoopBlockSize)) &&
+ (AlignedAlignment == E(AlignedAlignment)) &&
+ (AlignedArg == E(AlignedArg)) &&
+ (AcceleratorBegin == E(AcceleratorBegin)) &&
+ (AcceleratorEnd == E(AcceleratorEnd)) &&
+ (ElementClass == E(ElementClass));
+
+  // Ask the solver to never return this configuration again.
+ Solver.add(!CurrentLayout);
+ return R;
+}
+
+// Make sure `Variable` is one of the provided values.
+z3::expr RandomFunctionGenerator::inSetConstraint(z3::expr &Variable,
+ ArrayRef<int> Values) const {
+ z3::expr_vector Args(Variable.ctx());
+ for (int Value : Values)
+ Args.push_back(Variable == Value);
+ return z3::mk_or(Args);
+}
+
+void RandomFunctionGenerator::addBoundsAndAnchors(z3::expr &Begin,
+ z3::expr &End) {
+ // Begin and End are picked amongst a set of predefined values.
+ Solver.add(inSetConstraint(Begin, kAnchors));
+ Solver.add(inSetConstraint(End, kAnchors));
+ Solver.add(Begin >= 0);
+ Solver.add(Begin <= End);
+ Solver.add(End <= kMaxSize);
+}
+
+void RandomFunctionGenerator::addLoopConstraints(const z3::expr &LoopBegin,
+ const z3::expr &LoopEnd,
+ z3::expr &LoopBlockSize,
+ int LoopMinIter) {
+ Solver.add(inSetConstraint(LoopBlockSize, kLoopBlockSize));
+ Solver.add(LoopBegin == LoopEnd ||
+ (LoopBegin > (LoopMinIter * LoopBlockSize)));
+}
+
+} // namespace automemcpy
+} // namespace llvm
diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
new file mode 100644
index 0000000000000..00298f69f77f6
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp
@@ -0,0 +1,180 @@
+//===-- Analyze benchmark JSON files --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This code analyzes the JSON files produced by the `automemcpy` binary.
+//
+// As a reminder, `automemcpy` benchmarks each autogenerated memory function
+// against one of the predefined distributions available in the
+// `libc/benchmarks/distributions` folder.
+//
+// It works as follows:
+// - Reads one or more JSON files.
+// - If there are several runs for the same function and distribution, picks the
+// median throughput (aka `BytesPerSecond`).
+// - Aggregates the throughput per distribution and scores each function from
+// worst (0) to best (1).
+// - For each distribution, each function is then graded into one of the
+// following categories: EXCELLENT, VERY_GOOD, GOOD, PASSABLE, INADEQUATE,
+// MEDIOCRE, BAD.
+// - A process similar to the Majority Judgment voting system is used to `elect`
+// the best function. The histogram of grades is returned so we can
+// distinguish between functions with the same final grade. In the following
+// example, both functions grade EXCELLENT, but we may prefer the second one.
+//
+// | | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | ...
+// |------------|-----------|-----------|------|----------| ...
+// | Function_1 | 7 | 1 | 2 | | ...
+// | Function_2 | 6 | 4 | | | ...
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "llvm/ADT/StringRef.h"
+#include <numeric>
+#include <unordered_map>
+
+namespace llvm {
+
+namespace automemcpy {
+
+StringRef Grade::getString(const GradeEnum &GE) {
+ switch (GE) {
+ case EXCELLENT:
+ return "EXCELLENT";
+ case VERY_GOOD:
+ return "VERY_GOOD";
+ case GOOD:
+ return "GOOD";
+ case PASSABLE:
+ return "PASSABLE";
+ case INADEQUATE:
+ return "INADEQUATE";
+ case MEDIOCRE:
+ return "MEDIOCRE";
+ case BAD:
+ return "BAD";
+ case ARRAY_SIZE:
+ report_fatal_error("logic error");
+ }
+}
+
+Grade::GradeEnum Grade::judge(double Score) {
+ if (Score >= 6. / 7)
+ return EXCELLENT;
+ if (Score >= 5. / 7)
+ return VERY_GOOD;
+ if (Score >= 4. / 7)
+ return GOOD;
+ if (Score >= 3. / 7)
+ return PASSABLE;
+ if (Score >= 2. / 7)
+ return INADEQUATE;
+ if (Score >= 1. / 7)
+ return MEDIOCRE;
+ return BAD;
+}
+
+std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples) {
+ std::unordered_map<SampleId, std::vector<double>, SampleId::Hasher>
+ BucketedSamples;
+ for (const auto &S : Samples)
+ BucketedSamples[S.Id].push_back(S.BytesPerSecond);
+ std::unordered_map<FunctionId, StringMap<double>, FunctionId::Hasher>
+ Throughputs;
+ for (auto &Pair : BucketedSamples) {
+ const auto &Id = Pair.first;
+ auto &Values = Pair.second;
+ const size_t HalfSize = Values.size() / 2;
+ std::nth_element(Values.begin(), Values.begin() + HalfSize, Values.end());
+ const double MedianValue = Values[HalfSize];
+ Throughputs[Id.Function][Id.Distribution.Name] = MedianValue;
+ }
+ std::vector<FunctionData> Output;
+ for (auto &Pair : Throughputs) {
+ FunctionData Data;
+ Data.Id = Pair.first;
+ for (const auto &Pair : Pair.second)
+ Data.PerDistributionData[Pair.getKey()].MedianBytesPerSecond =
+ Pair.getValue();
+ Output.push_back(std::move(Data));
+ }
+ return Output;
+}
+
+void fillScores(MutableArrayRef<FunctionData> Functions) {
+ // A key to bucket throughput per function type and distribution.
+ struct Key {
+ FunctionType Type;
+ StringRef Distribution;
+
+ COMPARABLE_AND_HASHABLE(Key, Type, Distribution)
+ };
+
+ // Tracks minimum and maximum values.
+ struct MinMax {
+ double Min = std::numeric_limits<double>::max();
+ double Max = std::numeric_limits<double>::min();
+ void update(double Value) {
+ if (Value < Min)
+ Min = Value;
+ if (Value > Max)
+ Max = Value;
+ }
+ double normalize(double Value) const { return (Value - Min) / (Max - Min); }
+ };
+
+ std::unordered_map<Key, MinMax, Key::Hasher> ThroughputMinMax;
+ for (const auto &Function : Functions) {
+ const FunctionType Type = Function.Id.Type;
+ for (const auto &Pair : Function.PerDistributionData) {
+ const auto &Distribution = Pair.getKey();
+ const double Throughput = Pair.getValue().MedianBytesPerSecond;
+ const Key K{Type, Distribution};
+ ThroughputMinMax[K].update(Throughput);
+ }
+ }
+
+ for (auto &Function : Functions) {
+ const FunctionType Type = Function.Id.Type;
+ for (const auto &Pair : Function.PerDistributionData) {
+ const auto &Distribution = Pair.getKey();
+ const double Throughput = Pair.getValue().MedianBytesPerSecond;
+ const Key K{Type, Distribution};
+ Function.PerDistributionData[Distribution].Score =
+ ThroughputMinMax[K].normalize(Throughput);
+ }
+ }
+}
+
+void castVotes(MutableArrayRef<FunctionData> Functions) {
+ for (FunctionData &Function : Functions)
+ for (const auto &Pair : Function.PerDistributionData) {
+ const StringRef Distribution = Pair.getKey();
+ const double Score = Pair.getValue().Score;
+ const auto G = Grade::judge(Score);
+ ++(Function.GradeHisto[G]);
+ Function.PerDistributionData[Distribution].Grade = G;
+ }
+
+ for (FunctionData &Function : Functions) {
+ const auto &GradeHisto = Function.GradeHisto;
+ const size_t Votes =
+ std::accumulate(GradeHisto.begin(), GradeHisto.end(), 0U);
+ const size_t MedianVote = Votes / 2;
+ size_t CountedVotes = 0;
+ Grade::GradeEnum MedianGrade = Grade::BAD;
+ for (size_t I = 0; I < GradeHisto.size(); ++I) {
+ CountedVotes += GradeHisto[I];
+ if (CountedVotes > MedianVote) {
+ MedianGrade = Grade::GradeEnum(I);
+ break;
+ }
+ }
+ Function.FinalGrade = MedianGrade;
+ }
+}
+
+} // namespace automemcpy
+} // namespace llvm
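A quick reference for the two mechanisms above: `Grade::judge` splits the normalized [0, 1] score into sevenths, and `castVotes` derives the final grade as the median of the per-distribution grades. A small illustrative usage, assuming only the declarations from `ResultAnalyzer.h` (the function name `GradeExamples` is made up for the example):

  #include "automemcpy/ResultAnalyzer.h"
  #include <cassert>

  // Thresholds are sevenths of the normalized [0, 1] score (see Grade::judge).
  void GradeExamples() {
    using llvm::automemcpy::Grade;
    assert(Grade::judge(1.0) == Grade::EXCELLENT); // >= 6/7
    assert(Grade::judge(0.5) == Grade::PASSABLE);  // >= 3/7 and < 4/7
    assert(Grade::judge(0.1) == Grade::BAD);       // < 1/7
  }

On the voting side, with a histogram of 6 EXCELLENT and 4 VERY_GOOD votes, `castVotes` computes Votes = 10 and MedianVote = 5; the cumulative count reaches 6 > 5 already at EXCELLENT, so the final grade is EXCELLENT, matching Function_2 in the header comment of this file.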
diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
new file mode 100644
index 0000000000000..6a657e432c18f
--- /dev/null
+++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp
@@ -0,0 +1,158 @@
+//===-- Application to analyze benchmark JSON files -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+// User can specify one or more json filenames to process on the command line.
+static cl::list<std::string> InputFilenames(cl::Positional, cl::OneOrMore,
+ cl::desc("<input json files>"));
+
+namespace automemcpy {
+
+// This is defined in the autogenerated 'Implementations.cpp' file.
+extern ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors();
+
+// Iterates over all functions and fills a map of function name to function
+// descriptor pointers.
+static StringMap<const FunctionDescriptor *> createFunctionDescriptorMap() {
+ StringMap<const FunctionDescriptor *> Descriptors;
+ for (const NamedFunctionDescriptor &FD : getFunctionDescriptors())
+ Descriptors.insert_or_assign(FD.Name, &FD.Desc);
+ return Descriptors;
+}
+
+// Retrieves the function descriptor for a particular function name.
+static const FunctionDescriptor &getFunctionDescriptor(StringRef FunctionName) {
+ static StringMap<const FunctionDescriptor *> Descriptors =
+ createFunctionDescriptorMap();
+ const auto *FD = Descriptors.lookup(FunctionName);
+ if (!FD)
+ report_fatal_error(
+ Twine("No FunctionDescriptor for ").concat(FunctionName));
+ return *FD;
+}
+
+// Function and distribution names are stored quite a few times, so it is more
+// efficient to internalize these strings and refer to them through 'StringRef'.
+static StringRef getInternalizedString(StringRef VolatileStr) {
+ static llvm::StringSet<> StringCache;
+ return StringCache.insert(VolatileStr).first->getKey();
+}
+
+// Helper function for the LLVM JSON API.
+bool fromJSON(const json::Value &V, Sample &Out, json::Path P) {
+ std::string Label;
+ json::ObjectMapper O(V, P);
+ if (O && O.map("bytes_per_second", Out.BytesPerSecond) &&
+ O.map("label", Label)) {
+ const auto LabelPair = StringRef(Label).split(',');
+ Out.Id.Function.Name = getInternalizedString(LabelPair.first);
+ Out.Id.Function.Type = getFunctionDescriptor(LabelPair.first).Type;
+ Out.Id.Distribution.Name = getInternalizedString(LabelPair.second);
+ return true;
+ }
+ return false;
+}
+
+// An object representing the content of the JSON file.
+// Parsing and serializing JSON is easier when the structure of the JSON file
+// maps onto the structure of the object.
+struct JsonFile {
+ std::vector<Sample> Samples;
+};
+
+// Helper function for the LLVM JSON API.
+bool fromJSON(const json::Value &V, JsonFile &JF, json::Path P) {
+ json::ObjectMapper O(V, P);
+ return O && O.map("benchmarks", JF.Samples);
+}
+
+// Global object to ease error reporting: it consumes errors and crashes the
+// application with a meaningful message.
+static ExitOnError ExitOnErr;
+
+// Main JSON parsing method. Reads the content of the file pointed to by
+// 'Filename' and returns a JsonFile object.
+JsonFile parseJsonResultFile(StringRef Filename) {
+ auto Buf = ExitOnErr(errorOrToExpected(
+ MemoryBuffer::getFile(Filename, /*bool IsText=*/true,
+ /*RequiresNullTerminator=*/false)));
+ auto JsonValue = ExitOnErr(json::parse(Buf->getBuffer()));
+ json::Path::Root Root;
+ JsonFile JF;
+ if (!fromJSON(JsonValue, JF, Root))
+ ExitOnErr(Root.getError());
+ return JF;
+}
+
+// Serializes the 'GradeHisto' to the provided 'Stream'.
+static void Serialize(raw_ostream &Stream, const GradeHistogram &GH) {
+ static constexpr std::array<StringRef, 9> kCharacters = {
+ " ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
+
+ const size_t Max = *std::max_element(GH.begin(), GH.end());
+ for (size_t I = 0; I < GH.size(); ++I) {
+ size_t Index = (float(GH[I]) / Max) * (kCharacters.size() - 1);
+ Stream << kCharacters.at(Index);
+ }
+}
+
+int Main(int argc, char **argv) {
+ ExitOnErr.setBanner("Automemcpy Json Results Analyzer stopped with error: ");
+ cl::ParseCommandLineOptions(argc, argv, "Automemcpy Json Results Analyzer\n");
+
+ // Reads all samples stored in the input JSON files.
+ std::vector<Sample> Samples;
+ for (const auto &Filename : InputFilenames) {
+ auto Result = parseJsonResultFile(Filename);
+ llvm::append_range(Samples, Result.Samples);
+ }
+
+ // Extracts median of throughputs.
+ std::vector<FunctionData> Functions = getThroughputs(Samples);
+ fillScores(Functions);
+ castVotes(Functions);
+
+ // TODO: Implement tie breaking algorithm.
+ std::sort(Functions.begin(), Functions.end(),
+ [](const FunctionData &A, const FunctionData &B) {
+ return A.FinalGrade < B.FinalGrade;
+ });
+
+ // Present data by function type.
+ std::stable_sort(Functions.begin(), Functions.end(),
+ [](const FunctionData &A, const FunctionData &B) {
+ return A.Id.Type < B.Id.Type;
+ });
+
+ // Print result.
+ for (const FunctionData &Function : Functions) {
+ outs() << formatv("{0,-10}", Grade::getString(Function.FinalGrade));
+ outs() << " |";
+ Serialize(outs(), Function.GradeHisto);
+ outs() << "| ";
+ outs().resetColor();
+ outs() << formatv("{0,+25}", Function.Id.Name);
+ outs() << "\n";
+ }
+
+ return EXIT_SUCCESS;
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+int main(int argc, char **argv) { return llvm::automemcpy::Main(argc, argv); }
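For context, the `fromJSON` mappers above expect a top-level `benchmarks` array whose entries provide a numeric `bytes_per_second` and a `label` of the form `<function name>,<distribution name>`. A minimal hand-written input of that shape is sketched below; the label contents are placeholders, and at runtime the function part must name one of the generated descriptors or the analyzer aborts with "No FunctionDescriptor for ...".

  {
    "benchmarks": [
      { "bytes_per_second": 1.25e9, "label": "memcpy_0x7381B60C7BE75EF9,DistributionA" },
      { "bytes_per_second": 1.30e9, "label": "memcpy_0x7381B60C7BE75EF9,DistributionA" }
    ]
  }

Repeated entries for the same function/distribution pair are reduced to their median throughput by `getThroughputs`.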
diff --git a/libc/benchmarks/automemcpy/unittests/CMakeLists.txt b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt
new file mode 100644
index 0000000000000..35caaac1519ba
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_libc_benchmark_unittest(libc-automemcpy-codegen-test
+ SRCS CodeGenTest.cpp
+ DEPENDS automemcpy_codegen
+)
+
+add_libc_benchmark_unittest(libc-automemcpy-result-analyzer-test
+ SRCS ResultAnalyzerTest.cpp
+ DEPENDS automemcpy_result_analyzer_lib
+)
diff --git a/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
new file mode 100644
index 0000000000000..6849682c44459
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/CodeGenTest.cpp
@@ -0,0 +1,219 @@
+//===-- Automemcpy CodeGen Test -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/CodeGen.h"
+#include "automemcpy/RandomFunctionGenerator.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::AllOf;
+using testing::AnyOf;
+using testing::ElementsAre;
+using testing::Ge;
+using testing::Gt;
+using testing::Le;
+using testing::Lt;
+
+namespace llvm {
+namespace automemcpy {
+namespace {
+
+TEST(Automemcpy, Codegen) {
+ static constexpr FunctionDescriptor kDescriptors[] = {
+ {FunctionType::MEMCPY, llvm::None, llvm::None, llvm::None, llvm::None,
+ Accelerator{{0, kMaxSize}}, ElementTypeClass::NATIVE},
+ {FunctionType::MEMCPY, Contiguous{{0, 4}}, Overlap{{4, 256}},
+ Loop{{256, kMaxSize}, 64}, llvm::None, llvm::None,
+ ElementTypeClass::NATIVE},
+ {FunctionType::MEMCMP, Contiguous{{0, 2}}, Overlap{{2, 64}}, llvm::None,
+ AlignedLoop{Loop{{64, kMaxSize}, 16}, 16, AlignArg::_1}, llvm::None,
+ ElementTypeClass::NATIVE},
+ {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None,
+ AlignedLoop{Loop{{256, kMaxSize}, 32}, 16, AlignArg::_1}, llvm::None,
+ ElementTypeClass::NATIVE},
+ {FunctionType::MEMSET, Contiguous{{0, 2}}, Overlap{{2, 256}}, llvm::None,
+ AlignedLoop{Loop{{256, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None,
+ ElementTypeClass::NATIVE},
+ {FunctionType::BZERO, Contiguous{{0, 4}}, Overlap{{4, 128}}, llvm::None,
+ AlignedLoop{Loop{{128, kMaxSize}, 32}, 32, AlignArg::_1}, llvm::None,
+ ElementTypeClass::NATIVE},
+ };
+
+ std::string Output;
+ raw_string_ostream OutputStream(Output);
+ Serialize(OutputStream, kDescriptors);
+
+ EXPECT_STREQ(OutputStream.str().c_str(),
+ R"(// This file is auto-generated by libc/benchmarks/automemcpy.
+// Functions : 6
+
+#include "LibcFunctionPrototypes.h"
+#include "automemcpy/FunctionDescriptor.h"
+#include "src/string/memory_utils/elements.h"
+
+using llvm::libc_benchmarks::BzeroConfiguration;
+using llvm::libc_benchmarks::MemcmpOrBcmpConfiguration;
+using llvm::libc_benchmarks::MemcpyConfiguration;
+using llvm::libc_benchmarks::MemsetConfiguration;
+
+namespace __llvm_libc {
+
+static void memcpy_0xE00E29EE73994E2B(char *__restrict dst, const char *__restrict src, size_t size) {
+ using namespace __llvm_libc::x86;
+ return Copy<Accelerator>(dst, src, size);
+}
+static void memcpy_0x7381B60C7BE75EF9(char *__restrict dst, const char *__restrict src, size_t size) {
+ using namespace __llvm_libc::x86;
+ if(size == 0) return;
+ if(size == 1) return Copy<_1>(dst, src);
+ if(size == 2) return Copy<_2>(dst, src);
+ if(size == 3) return Copy<_3>(dst, src);
+ if(size < 8) return Copy<HeadTail<_4>>(dst, src, size);
+ if(size < 16) return Copy<HeadTail<_8>>(dst, src, size);
+ if(size < 32) return Copy<HeadTail<_16>>(dst, src, size);
+ if(size < 64) return Copy<HeadTail<_32>>(dst, src, size);
+ if(size < 128) return Copy<HeadTail<_64>>(dst, src, size);
+ if(size < 256) return Copy<HeadTail<_128>>(dst, src, size);
+ return Copy<Loop<_64>>(dst, src, size);
+}
+static int memcmp_0x348D7BA6DB0EE033(const char * lhs, const char * rhs, size_t size) {
+ using namespace __llvm_libc::x86;
+ if(size == 0) return 0;
+ if(size == 1) return ThreeWayCompare<_1>(lhs, rhs);
+ if(size < 4) return ThreeWayCompare<HeadTail<_2>>(lhs, rhs, size);
+ if(size < 8) return ThreeWayCompare<HeadTail<_4>>(lhs, rhs, size);
+ if(size < 16) return ThreeWayCompare<HeadTail<_8>>(lhs, rhs, size);
+ if(size < 32) return ThreeWayCompare<HeadTail<_16>>(lhs, rhs, size);
+ if(size < 64) return ThreeWayCompare<HeadTail<_32>>(lhs, rhs, size);
+ return ThreeWayCompare<Align<_16,Arg::Lhs>::Then<Loop<_16>>>(lhs, rhs, size);
+}
+static void memset_0x71E761699B999863(char * dst, int value, size_t size) {
+ using namespace __llvm_libc::x86;
+ if(size == 0) return;
+ if(size == 1) return SplatSet<_1>(dst, value);
+ if(size < 4) return SplatSet<HeadTail<_2>>(dst, value, size);
+ if(size < 8) return SplatSet<HeadTail<_4>>(dst, value, size);
+ if(size < 16) return SplatSet<HeadTail<_8>>(dst, value, size);
+ if(size < 32) return SplatSet<HeadTail<_16>>(dst, value, size);
+ if(size < 64) return SplatSet<HeadTail<_32>>(dst, value, size);
+ if(size < 128) return SplatSet<HeadTail<_64>>(dst, value, size);
+ if(size < 256) return SplatSet<HeadTail<_128>>(dst, value, size);
+ return SplatSet<Align<_16,Arg::Dst>::Then<Loop<_32>>>(dst, value, size);
+}
+static void memset_0x3DF0F44E2ED6A50F(char * dst, int value, size_t size) {
+ using namespace __llvm_libc::x86;
+ if(size == 0) return;
+ if(size == 1) return SplatSet<_1>(dst, value);
+ if(size < 4) return SplatSet<HeadTail<_2>>(dst, value, size);
+ if(size < 8) return SplatSet<HeadTail<_4>>(dst, value, size);
+ if(size < 16) return SplatSet<HeadTail<_8>>(dst, value, size);
+ if(size < 32) return SplatSet<HeadTail<_16>>(dst, value, size);
+ if(size < 64) return SplatSet<HeadTail<_32>>(dst, value, size);
+ if(size < 128) return SplatSet<HeadTail<_64>>(dst, value, size);
+ if(size < 256) return SplatSet<HeadTail<_128>>(dst, value, size);
+ return SplatSet<Align<_32,Arg::Dst>::Then<Loop<_32>>>(dst, value, size);
+}
+static void bzero_0x475977492C218AD4(char * dst, size_t size) {
+ using namespace __llvm_libc::x86;
+ if(size == 0) return;
+ if(size == 1) return SplatSet<_1>(dst, 0);
+ if(size == 2) return SplatSet<_2>(dst, 0);
+ if(size == 3) return SplatSet<_3>(dst, 0);
+ if(size < 8) return SplatSet<HeadTail<_4>>(dst, 0, size);
+ if(size < 16) return SplatSet<HeadTail<_8>>(dst, 0, size);
+ if(size < 32) return SplatSet<HeadTail<_16>>(dst, 0, size);
+ if(size < 64) return SplatSet<HeadTail<_32>>(dst, 0, size);
+ if(size < 128) return SplatSet<HeadTail<_64>>(dst, 0, size);
+ return SplatSet<Align<_32,Arg::Dst>::Then<Loop<_32>>>(dst, 0, size);
+}
+
+} // namespace __llvm_libc
+
+namespace llvm {
+namespace automemcpy {
+
+ArrayRef<NamedFunctionDescriptor> getFunctionDescriptors() {
+ static constexpr NamedFunctionDescriptor kDescriptors[] = {
+ {"memcpy_0xE00E29EE73994E2B",{FunctionType::MEMCPY,llvm::None,llvm::None,llvm::None,llvm::None,Accelerator{{0,kMaxSize}},ElementTypeClass::NATIVE}},
+ {"memcpy_0x7381B60C7BE75EF9",{FunctionType::MEMCPY,Contiguous{{0,4}},Overlap{{4,256}},Loop{{256,kMaxSize},64},llvm::None,llvm::None,ElementTypeClass::NATIVE}},
+ {"memcmp_0x348D7BA6DB0EE033",{FunctionType::MEMCMP,Contiguous{{0,2}},Overlap{{2,64}},llvm::None,AlignedLoop{Loop{{64,kMaxSize},16},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+ {"memset_0x71E761699B999863",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},16,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+ {"memset_0x3DF0F44E2ED6A50F",{FunctionType::MEMSET,Contiguous{{0,2}},Overlap{{2,256}},llvm::None,AlignedLoop{Loop{{256,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+ {"bzero_0x475977492C218AD4",{FunctionType::BZERO,Contiguous{{0,4}},Overlap{{4,128}},llvm::None,AlignedLoop{Loop{{128,kMaxSize},32},32,AlignArg::_1},llvm::None,ElementTypeClass::NATIVE}},
+ };
+ return makeArrayRef(kDescriptors);
+}
+
+} // namespace automemcpy
+} // namespace llvm
+
+
+using MemcpyStub = void (*)(char *__restrict, const char *__restrict, size_t);
+template <MemcpyStub Foo>
+void *Wrap(void *__restrict dst, const void *__restrict src, size_t size) {
+ Foo(reinterpret_cast<char *__restrict>(dst),
+ reinterpret_cast<const char *__restrict>(src), size);
+ return dst;
+}
+llvm::ArrayRef<MemcpyConfiguration> getMemcpyConfigurations() {
+ using namespace __llvm_libc;
+ static constexpr MemcpyConfiguration kConfigurations[] = {
+ {Wrap<memcpy_0xE00E29EE73994E2B>, "memcpy_0xE00E29EE73994E2B"},
+ {Wrap<memcpy_0x7381B60C7BE75EF9>, "memcpy_0x7381B60C7BE75EF9"},
+ };
+ return llvm::makeArrayRef(kConfigurations);
+}
+
+using MemcmpStub = int (*)(const char *, const char *, size_t);
+template <MemcmpStub Foo>
+int Wrap(const void *lhs, const void *rhs, size_t size) {
+ return Foo(reinterpret_cast<const char *>(lhs),
+ reinterpret_cast<const char *>(rhs), size);
+}
+llvm::ArrayRef<MemcmpOrBcmpConfiguration> getMemcmpConfigurations() {
+ using namespace __llvm_libc;
+ static constexpr MemcmpOrBcmpConfiguration kConfigurations[] = {
+ {Wrap<memcmp_0x348D7BA6DB0EE033>, "memcmp_0x348D7BA6DB0EE033"},
+ };
+ return llvm::makeArrayRef(kConfigurations);
+}
+llvm::ArrayRef<MemcmpOrBcmpConfiguration> getBcmpConfigurations() {
+ return {};
+}
+
+using MemsetStub = void (*)(char *, int, size_t);
+template <MemsetStub Foo> void *Wrap(void *dst, int value, size_t size) {
+ Foo(reinterpret_cast<char *>(dst), value, size);
+ return dst;
+}
+llvm::ArrayRef<MemsetConfiguration> getMemsetConfigurations() {
+ using namespace __llvm_libc;
+ static constexpr MemsetConfiguration kConfigurations[] = {
+ {Wrap<memset_0x71E761699B999863>, "memset_0x71E761699B999863"},
+ {Wrap<memset_0x3DF0F44E2ED6A50F>, "memset_0x3DF0F44E2ED6A50F"},
+ };
+ return llvm::makeArrayRef(kConfigurations);
+}
+
+using BzeroStub = void (*)(char *, size_t);
+template <BzeroStub Foo> void Wrap(void *dst, size_t size) {
+ Foo(reinterpret_cast<char *>(dst), size);
+}
+llvm::ArrayRef<BzeroConfiguration> getBzeroConfigurations() {
+ using namespace __llvm_libc;
+ static constexpr BzeroConfiguration kConfigurations[] = {
+ {Wrap<bzero_0x475977492C218AD4>, "bzero_0x475977492C218AD4"},
+ };
+ return llvm::makeArrayRef(kConfigurations);
+}
+// Functions : 6
+)");
+}
+} // namespace
+} // namespace automemcpy
+} // namespace llvm
diff --git a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp
new file mode 100644
index 0000000000000..bce508d17acbd
--- /dev/null
+++ b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp
@@ -0,0 +1,170 @@
+//===-- Automemcpy Json Results Analyzer Test ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "automemcpy/ResultAnalyzer.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::ElementsAre;
+using testing::Pair;
+using testing::SizeIs;
+
+namespace llvm {
+namespace automemcpy {
+namespace {
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsOneSample) {
+ static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+ static constexpr DistributionId DistA = {{"A"}};
+ static constexpr SampleId Id = {Foo1, DistA};
+ static constexpr Sample kSamples[] = {
+ Sample{Id, 4},
+ };
+
+ const std::vector<FunctionData> Data = getThroughputs(kSamples);
+ EXPECT_THAT(Data, SizeIs(1));
+ EXPECT_THAT(Data[0].Id, Foo1);
+ EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1));
+ // A single value is provided.
+ EXPECT_THAT(
+ Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 4);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsManySamplesSameBucket) {
+ static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+ static constexpr DistributionId DistA = {{"A"}};
+ static constexpr SampleId Id = {Foo1, DistA};
+ static constexpr Sample kSamples[] = {Sample{Id, 4}, Sample{Id, 5},
+ Sample{Id, 5}};
+
+ const std::vector<FunctionData> Data = getThroughputs(kSamples);
+ EXPECT_THAT(Data, SizeIs(1));
+ EXPECT_THAT(Data[0].Id, Foo1);
+ EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1));
+ // When multiple values are provided we pick the median one (here median of 4,
+ // 5, 5).
+ EXPECT_THAT(
+ Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 5);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsSeveralFunctionAndDist) {
+ static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+ static constexpr DistributionId DistA = {{"A"}};
+ static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+ static constexpr DistributionId DistB = {{"B"}};
+ static constexpr Sample kSamples[] = {
+ Sample{{Foo1, DistA}, 1}, Sample{{Foo1, DistB}, 2},
+ Sample{{Foo2, DistA}, 3}, Sample{{Foo2, DistB}, 4}};
+ // Data is aggregated per function.
+ const std::vector<FunctionData> Data = getThroughputs(kSamples);
+ EXPECT_THAT(Data, SizeIs(2)); // 2 functions Foo1 and Foo2.
+ // Each function has data for both distributions DistA and DistB.
+ EXPECT_THAT(Data[0].PerDistributionData, SizeIs(2));
+ EXPECT_THAT(Data[1].PerDistributionData, SizeIs(2));
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, getScore) {
+ static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+ static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+ static constexpr FunctionId Foo3 = {"memcpy3", FunctionType::MEMCPY};
+ static constexpr DistributionId Dist = {{"A"}};
+ static constexpr Sample kSamples[] = {Sample{{Foo1, Dist}, 1},
+ Sample{{Foo2, Dist}, 2},
+ Sample{{Foo3, Dist}, 3}};
+
+ // Data is aggregated per function.
+ std::vector<FunctionData> Data = getThroughputs(kSamples);
+
+ // Sort Data by function name so we can test them.
+ std::sort(
+ Data.begin(), Data.end(),
+ [](const FunctionData &A, const FunctionData &B) { return A.Id < B.Id; });
+
+ EXPECT_THAT(Data[0].Id, Foo1);
+ EXPECT_THAT(Data[0].PerDistributionData.lookup("A").MedianBytesPerSecond, 1);
+ EXPECT_THAT(Data[1].Id, Foo2);
+ EXPECT_THAT(Data[1].PerDistributionData.lookup("A").MedianBytesPerSecond, 2);
+ EXPECT_THAT(Data[2].Id, Foo3);
+ EXPECT_THAT(Data[2].PerDistributionData.lookup("A").MedianBytesPerSecond, 3);
+
+ // Normalizes throughput per distribution.
+ fillScores(Data);
+ EXPECT_THAT(Data[0].PerDistributionData.lookup("A").Score, 0);
+ EXPECT_THAT(Data[1].PerDistributionData.lookup("A").Score, 0.5);
+ EXPECT_THAT(Data[2].PerDistributionData.lookup("A").Score, 1);
+}
+
+TEST(AutomemcpyJsonResultsAnalyzer, castVotes) {
+ static constexpr double kAbsErr = 0.01;
+
+ static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY};
+ static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY};
+ static constexpr FunctionId Foo3 = {"memcpy3", FunctionType::MEMCPY};
+ static constexpr DistributionId DistA = {{"A"}};
+ static constexpr DistributionId DistB = {{"B"}};
+ static constexpr Sample kSamples[] = {
+ Sample{{Foo1, DistA}, 0}, Sample{{Foo1, DistB}, 30},
+ Sample{{Foo2, DistA}, 1}, Sample{{Foo2, DistB}, 100},
+ Sample{{Foo3, DistA}, 7}, Sample{{Foo3, DistB}, 100},
+ };
+
+ // DistA Throughput ranges from 0 to 7.
+ // DistB Throughput ranges from 30 to 100.
+
+ // Data is aggregated per function.
+ std::vector<FunctionData> Data = getThroughputs(kSamples);
+
+ // Sort Data by function name so we can test them.
+ std::sort(
+ Data.begin(), Data.end(),
+ [](const FunctionData &A, const FunctionData &B) { return A.Id < B.Id; });
+
+ // Normalizes throughput per distribution.
+ fillScores(Data);
+
+ // Cast votes
+ castVotes(Data);
+
+ EXPECT_THAT(Data[0].Id, Foo1);
+ EXPECT_THAT(Data[1].Id, Foo2);
+ EXPECT_THAT(Data[2].Id, Foo3);
+
+ // Distribution A
+ // Throughput is 0, 1 and 7, so normalized scores are 0, 1/7 and 1.
+ EXPECT_NEAR(Data[0].PerDistributionData.lookup("A").Score, 0, kAbsErr);
+ EXPECT_NEAR(Data[1].PerDistributionData.lookup("A").Score, 1. / 7, kAbsErr);
+ EXPECT_NEAR(Data[2].PerDistributionData.lookup("A").Score, 1, kAbsErr);
+ // which are turned into grades BAD, MEDIOCRE and EXCELLENT.
+ EXPECT_THAT(Data[0].PerDistributionData.lookup("A").Grade, Grade::BAD);
+ EXPECT_THAT(Data[1].PerDistributionData.lookup("A").Grade, Grade::MEDIOCRE);
+ EXPECT_THAT(Data[2].PerDistributionData.lookup("A").Grade, Grade::EXCELLENT);
+
+ // Distribution B
+ // Throughput is 30, 100 and 100, so normalized scores are 0, 1 and 1.
+ EXPECT_NEAR(Data[0].PerDistributionData.lookup("B").Score, 0, kAbsErr);
+ EXPECT_NEAR(Data[1].PerDistributionData.lookup("B").Score, 1, kAbsErr);
+ EXPECT_NEAR(Data[2].PerDistributionData.lookup("B").Score, 1, kAbsErr);
+ // which are turned into grades BAD, EXCELLENT and EXCELLENT.
+ EXPECT_THAT(Data[0].PerDistributionData.lookup("B").Grade, Grade::BAD);
+ EXPECT_THAT(Data[1].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT);
+ EXPECT_THAT(Data[2].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT);
+
+ // Now looking from the functions point of view.
+ // Note the array is indexed by GradeEnum values (EXCELLENT=0 / BAD = 6)
+ EXPECT_THAT(Data[0].GradeHisto, ElementsAre(0, 0, 0, 0, 0, 0, 2));
+ EXPECT_THAT(Data[1].GradeHisto, ElementsAre(1, 0, 0, 0, 0, 1, 0));
+ EXPECT_THAT(Data[2].GradeHisto, ElementsAre(2, 0, 0, 0, 0, 0, 0));
+
+ EXPECT_THAT(Data[0].FinalGrade, Grade::BAD);
+ EXPECT_THAT(Data[1].FinalGrade, Grade::MEDIOCRE);
+ EXPECT_THAT(Data[2].FinalGrade, Grade::EXCELLENT);
+}
+
+} // namespace
+} // namespace automemcpy
+} // namespace llvm
diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
index 1b893373e41cd..63e30baded12b 100644
--- a/libc/src/string/memory_utils/elements.h
+++ b/libc/src/string/memory_utils/elements.h
@@ -151,6 +151,43 @@ template <> struct Chained<> {
static void SplatSet(char *dst, const unsigned char value) {}
};
+// Overlap ElementA and ElementB so they span Size bytes.
+template <size_t Size, typename ElementA, typename ElementB = ElementA>
+struct Overlap {
+ static constexpr size_t kSize = Size;
+ static_assert(ElementB::kSize <= ElementA::kSize, "ElementB too big");
+ static_assert(ElementA::kSize <= Size, "ElementA too big");
+ static_assert((ElementA::kSize + ElementB::kSize) >= Size,
+ "Elements too small to overlap");
+ static constexpr size_t kOffset = kSize - ElementB::kSize;
+
+ static void Copy(char *__restrict dst, const char *__restrict src) {
+ ElementA::Copy(dst, src);
+ ElementB::Copy(dst + kOffset, src + kOffset);
+ }
+
+ static bool Equals(const char *lhs, const char *rhs) {
+ if (!ElementA::Equals(lhs, rhs))
+ return false;
+ if (!ElementB::Equals(lhs + kOffset, rhs + kOffset))
+ return false;
+ return true;
+ }
+
+ static int ThreeWayCompare(const char *lhs, const char *rhs) {
+ if (!ElementA::Equals(lhs, rhs))
+ return ElementA::ThreeWayCompare(lhs, rhs);
+ if (!ElementB::Equals(lhs + kOffset, rhs + kOffset))
+ return ElementB::ThreeWayCompare(lhs + kOffset, rhs + kOffset);
+ return 0;
+ }
+
+ static void SplatSet(char *dst, const unsigned char value) {
+ ElementA::SplatSet(dst, value);
+ ElementB::SplatSet(dst + kOffset, value);
+ }
+};
+
// Runtime-size Higher-Order Operations
// ------------------------------------
// - Tail<T>: Perform the operation on the last 'T::kSize' bytes of the buffer.
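The `Overlap<>` element added above spans a fixed `Size` with two fixed-size accesses that may overlap in the middle (`kOffset = Size - ElementB::kSize`). A standalone sketch of the same trick for `Size` = 7 with two 4-byte accesses, written against plain `memcpy` rather than the header's element types (the helper name is illustrative):

  #include <cstring>

  // Copy exactly 7 bytes with two fixed 4-byte accesses: bytes [0, 4) via the
  // first access and bytes [3, 7) via the second (kOffset = 7 - 4 = 3). Byte 3
  // is written twice with the same data, which is harmless and avoids any
  // size-dependent branching or looping.
  void CopySeven(char *__restrict Dst, const char *__restrict Src) {
    std::memcpy(Dst, Src, 4);
    std::memcpy(Dst + 3, Src + 3, 4);
  }

The runtime-sized `HeadTail` dispatches seen in the generated functions rely on the same overlapping-access idea, with the second access anchored at the end of the buffer.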