[clang] aec12c1 - [analyzer][tests] Add a notion of project sizes

Mon Aug 24 06:13:24 PDT 2020

Author: Valeriy Savchenko
Date: 2020-08-24T16:13:00+03:00
New Revision: aec12c1264ac17877d5cb19750eaa322fe57342d

URL: https://github.com/llvm/llvm-project/commit/aec12c1264ac17877d5cb19750eaa322fe57342d
DIFF: https://github.com/llvm/llvm-project/commit/aec12c1264ac17877d5cb19750eaa322fe57342d.diff

LOG: [analyzer][tests] Add a notion of project sizes

Summary:
With the number of projects growing, it is important to be able to
filter them in a more convenient way than by names.  It is especially
important for benchmarks, when it is not viable to analyze big
projects 20 or 50 times in a row.

For this reason, this commit adds a notion of sizes and a
filtering interface that puts a limit on the maximum size of the projects
to analyze or benchmark.

Sizes assigned to the projects in this commit do not directly
correspond to the number of lines or files in the project.  The key
factor that is important for the developers of the analyzer is the
time it takes to analyze the project.  And for this very reason,
"size" basically helps to cluster projects based on their analysis
time.

Differential Revision: https://reviews.llvm.org/D83942

Added: 
    

Modified: 
    clang/utils/analyzer/ProjectMap.py
    clang/utils/analyzer/SATest.py
    clang/utils/analyzer/projects/projects.json

Removed: 
    


################################################################################
diff  --git a/clang/utils/analyzer/ProjectMap.py b/clang/utils/analyzer/ProjectMap.py
index 3daa70140562..1e89ce634e57 100644

--- a/clang/utils/analyzer/ProjectMap.py
+++ b/clang/utils/analyzer/ProjectMap.py
@@ -1,7 +1,7 @@
 import json
 import os
 
-from enum import Enum
+from enum import auto, Enum
 from typing import Any, Dict, List, NamedTuple, Optional, Tuple
 
 
@@ -17,6 +17,64 @@ class DownloadType(str, Enum):
     SCRIPT = "script"
 
 
+class Size(int, Enum):
+    """
+    Size of the project.
+
+    Sizes do not directly correspond to the number of lines or files in the
+    project.  The key factor that is important for the developers of the
+    analyzer is the time it takes to analyze the project.  Here is how
+    the following sizes map to times:
+
+    TINY:  <1min
+    SMALL: 1min-10min
+    BIG:   10min-1h
+    HUGE:  >1h
+
+    The borders are a bit of a blur, especially because analysis time varies
+    from one machine to another.  However, the relative times will stay pretty
+    similar, and these groupings will still be helpful.
+
+    UNSPECIFIED is a very special case, which is intentionally last in the list
+    of possible sizes.  If the user wants to filter projects by one of the
+    possible sizes, we want projects with UNSPECIFIED size to be filtered out
+    for any given size.
+    """
+    TINY = auto()
+    SMALL = auto()
+    BIG = auto()
+    HUGE = auto()
+    UNSPECIFIED = auto()
+
+    @staticmethod
+    def from_str(raw_size: Optional[str]) -> "Size":
+        """
+        Construct a Size object from an optional string.
+
+        :param raw_size: optional string representation of the desired Size
+                         object.  None will produce UNSPECIFIED size.
+
+        This method is case-insensitive, so raw sizes 'tiny', 'TINY', and
+        'TiNy' will produce the same result.
+        """
+        if raw_size is None:
+            return Size.UNSPECIFIED
+
+        raw_size_upper = raw_size.upper()
+        # The implementation is decoupled from the actual values of the enum,
+        # so we can easily add or modify it without bothering about this
+        # function.
+        for possible_size in Size:
+            if possible_size.name == raw_size_upper:
+                return possible_size
+
+        possible_sizes = [size.name.lower() for size in Size
+                          # no need in showing our users this size
+                          if size != Size.UNSPECIFIED]
+        raise ValueError(f"Incorrect project size '{raw_size}'. "
+                         f"Available sizes are {possible_sizes}")
+
+
 class ProjectInfo(NamedTuple):
     """
     Information about a project to analyze.
@@ -27,6 +85,7 @@ class ProjectInfo(NamedTuple):
     origin: str = ""
     commit: str = ""
     enabled: bool = True
+    size: Size = Size.UNSPECIFIED
 
     def with_fields(self, **kwargs) -> "ProjectInfo":
         """
@@ -98,6 +157,7 @@ def _parse_project(raw_project: JSON) -> ProjectInfo:
             build_mode: int = raw_project["mode"]
             enabled: bool = raw_project.get("enabled", True)
             source: DownloadType = raw_project.get("source", "zip")
+            size = Size.from_str(raw_project.get("size", None))
 
             if source == DownloadType.GIT:
                 origin, commit = ProjectMap._get_git_params(raw_project)
@@ -105,7 +165,7 @@ def _parse_project(raw_project: JSON) -> ProjectInfo:
                 origin, commit = "", ""
 
             return ProjectInfo(name, build_mode, source, origin, commit,
-                               enabled)
+                               enabled, size)
 
         except KeyError as e:
             raise ValueError(

diff  --git a/clang/utils/analyzer/SATest.py b/clang/utils/analyzer/SATest.py
index 86571902502f..176fe40a2b17 100755
--- a/clang/utils/analyzer/SATest.py
+++ b/clang/utils/analyzer/SATest.py
@@ -37,7 +37,7 @@ def build(parser, args):
 
     SATestBuild.VERBOSE = args.verbose
 
-    projects = get_projects(parser, args.projects)
+    projects = get_projects(parser, args)
     tester = SATestBuild.RegressionTester(args.jobs,
                                           projects,
                                           args.override_compiler,
@@ -84,7 +84,7 @@ def update(parser, args):
 def benchmark(parser, args):
     from SATestBenchmark import Benchmark
 
-    projects = get_projects(parser, args.projects)
+    projects = get_projects(parser, args)
     benchmark = Benchmark(projects, args.iterations, args.output)
     benchmark.run()
 
@@ -94,14 +94,19 @@ def benchmark_compare(parser, args):
     SATestBenchmark.compare(args.old, args.new, args.output)
 
 
-def get_projects(parser, projects_str):
-    from ProjectMap import ProjectMap
+def get_projects(parser, args):
+    from ProjectMap import ProjectMap, Size
 
     project_map = ProjectMap()
     projects = project_map.projects
 
-    if projects_str:
-        projects_arg = projects_str.split(",")
+    def filter_projects(projects, predicate, force=False):
+        return [project.with_fields(enabled=(force or project.enabled) and
+                                    predicate(project))
+                for project in projects]
+
+    if args.projects:
+        projects_arg = args.projects.split(",")
         available_projects = [project.name
                               for project in projects]
 
@@ -113,8 +118,17 @@ def get_projects(parser, projects_str):
                              "{all}.".format(project=manual_project,
                                              all=available_projects))
 
-        projects = [project.with_fields(enabled=project.name in projects_arg)
-                    for project in projects]
+        projects = filter_projects(projects, lambda project:
+                                   project.name in projects_arg,
+                                   force=True)
+
+    try:
+        max_size = Size.from_str(args.max_size)
+    except ValueError as e:
+        parser.error("{}".format(e))
+
+    projects = filter_projects(projects, lambda project:
+                               project.size <= max_size)
 
     return projects
 
@@ -238,6 +252,8 @@ def main():
                               help="Arguments passed to to -analyzer-config")
     build_parser.add_argument("--projects", action="store", default="",
                               help="Comma-separated list of projects to test")
+    build_parser.add_argument("--max-size", action="store", default=None,
+                              help="Maximum size for the projects to test")
     build_parser.add_argument("-v", "--verbose", action="count", default=0)
     build_parser.set_defaults(func=build)
 
@@ -318,6 +334,8 @@ def main():
                               help="Output csv file for the benchmark results")
     bench_parser.add_argument("--projects", action="store", default="",
                               help="Comma-separated list of projects to test")
+    bench_parser.add_argument("--max-size", action="store", default=None,
+                              help="Maximum size for the projects to test")
     bench_parser.set_defaults(func=benchmark)
 
     bench_subparsers = bench_parser.add_subparsers()

diff  --git a/clang/utils/analyzer/projects/projects.json b/clang/utils/analyzer/projects/projects.json
index 84b741035f46..80b61ecd3874 100644
--- a/clang/utils/analyzer/projects/projects.json
+++ b/clang/utils/analyzer/projects/projects.json
@@ -4,139 +4,159 @@
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/jarro2783/cxxopts.git",
-    "commit": "794c975"
+    "commit": "794c975",
+    "size": "tiny"
   },
   {
     "name": "box2d",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/erincatto/box2d.git",
-    "commit": "1025f9a"
+    "commit": "1025f9a",
+    "size": "small"
   },
   {
     "name": "tinyexpr",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/codeplea/tinyexpr.git",
-    "commit": "ffb0d41"
+    "commit": "ffb0d41",
+    "size": "tiny"
   },
   {
     "name": "symengine",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/symengine/symengine.git",
-    "commit": "4f669d59"
+    "commit": "4f669d59",
+    "size": "small"
   },
   {
     "name": "termbox",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/nsf/termbox.git",
-    "commit": "0df1355"
+    "commit": "0df1355",
+    "size": "tiny"
   },
   {
     "name": "tinyvm",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/jakogut/tinyvm.git",
-    "commit": "10c25d8"
+    "commit": "10c25d8",
+    "size": "tiny"
   },
   {
     "name": "tinyspline",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/msteinbeck/tinyspline.git",
-    "commit": "f8b1ab7"
+    "commit": "f8b1ab7",
+    "size": "tiny"
   },
   {
     "name": "oatpp",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/oatpp/oatpp.git",
-    "commit": "d3e60fb"
+    "commit": "d3e60fb",
+    "size": "small"
   },
   {
     "name": "libsoundio",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/andrewrk/libsoundio.git",
-    "commit": "b810bf2"
+    "commit": "b810bf2",
+    "size": "tiny"
   },
   {
     "name": "zstd",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/facebook/zstd.git",
-    "commit": "2af4e073"
+    "commit": "2af4e073",
+    "size": "small"
   },
   {
     "name": "simbody",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/simbody/simbody.git",
-    "commit": "5cf513d"
+    "commit": "5cf513d",
+    "size": "big"
   },
   {
     "name": "duckdb",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/cwida/duckdb.git",
-    "commit": "d098c9f"
+    "commit": "d098c9f",
+    "size": "big"
   },
   {
     "name": "drogon",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/an-tao/drogon.git",
-    "commit": "fd2a612"
+    "commit": "fd2a612",
+    "size": "small"
   },
   {
     "name": "fmt",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/fmtlib/fmt.git",
-    "commit": "5e7c70e"
+    "commit": "5e7c70e",
+    "size": "small"
   },
   {
     "name": "re2",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/google/re2.git",
-    "commit": "2b25567"
+    "commit": "2b25567",
+    "size": "small"
   },
   {
     "name": "cppcheck",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/danmar/cppcheck.git",
-    "commit": "5fa3d53"
+    "commit": "5fa3d53",
+    "size": "small"
   },
   {
     "name": "harfbuzz",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/harfbuzz/harfbuzz.git",
-    "commit": "f8d345e"
+    "commit": "f8d345e",
+    "size": "small"
   },
   {
     "name": "capnproto",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/capnproto/capnproto.git",
-    "commit": "8be1c9f"
+    "commit": "8be1c9f",
+    "size": "small"
   },
   {
     "name": "tmux",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/tmux/tmux.git",
-    "commit": "a5f99e1"
+    "commit": "a5f99e1",
+    "size": "big"
   },
   {
     "name": "faiss",
     "mode": 1,
     "source": "git",
     "origin": "https://github.com/facebookresearch/faiss.git",
-    "commit": "9e5d5b7"
+    "commit": "9e5d5b7",
+    "size": "small"
   }
 ]