[llvm] 5bd9eee - [find_interesting_reviews.py] Add git blame output cache

Mon Dec 23 04:08:37 PST 2019

Author: Kristof Beyls
Date: 2019-12-23T12:08:16Z
New Revision: 5bd9eee53d124d00885395f6f2d6a8c64deca188

URL: https://github.com/llvm/llvm-project/commit/5bd9eee53d124d00885395f6f2d6a8c64deca188
DIFF: https://github.com/llvm/llvm-project/commit/5bd9eee53d124d00885395f6f2d6a8c64deca188.diff

LOG: [find_interesting_reviews.py] Add git blame output cache

The majority of the running time of this script tends to be spent in
running git blame on source files touched by patches under review.

By introducing a git blame output cache, some of the git blame commands
don't have to re-run, and the blame information can be retrieved from a
cache.

I've observed that in a typical run matching patches available for
review with potential reviewers, this speeds up the script's running
time by a factor of about 2.5x.

Added: 
    

Modified: 
    llvm/utils/Reviewing/find_interesting_reviews.py

Removed: 
    


################################################################################
diff  --git a/llvm/utils/Reviewing/find_interesting_reviews.py b/llvm/utils/Reviewing/find_interesting_reviews.py
index b1664f5044e4..58b856986577 100644

--- a/llvm/utils/Reviewing/find_interesting_reviews.py
+++ b/llvm/utils/Reviewing/find_interesting_reviews.py
@@ -458,11 +458,11 @@ def get_git_cmd_output(cmd):
 reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")
 
 
-def parse_blame_output_line_porcelain(blame_output):
+def parse_blame_output_line_porcelain(blame_output_lines):
     email2nr_occurences = {}
-    if blame_output is None:
+    if blame_output_lines is None:
         return email2nr_occurences
-    for line in blame_output.split('\n'):
+    for line in blame_output_lines:
         m = reAuthorMail.match(line)
         if m:
             author_email_address = m.group(1)
@@ -473,6 +473,54 @@ def parse_blame_output_line_porcelain(blame_output):
     return email2nr_occurences
 
 
+class BlameOutputCache:
+    def __init__(self):
+        self.cache = {}
+
+    def _populate_cache_for(self, cache_key):
+        assert cache_key not in self.cache
+        git_repo, base_revision, path = cache_key
+        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
+               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
+                                                     path)
+        blame_output = get_git_cmd_output(cmd)
+        self.cache[cache_key] = \
+            blame_output.split('\n') if blame_output is not None else None
+        # FIXME: the blame cache could probably be made more effective still if
+        # instead of storing the requested base_revision in the cache, the last
+        # revision before the base revision this file/path got changed in gets
+        # stored. That way multiple project revisions for which this specific
+        # file/patch hasn't changed would get cache hits (instead of misses in
+        # the current implementation).
+
+    def get_blame_output_for(self, git_repo, base_revision, path, start_line=-1,
+                             end_line=-1):
+        cache_key = (git_repo, base_revision, path)
+        if cache_key not in self.cache:
+            self._populate_cache_for(cache_key)
+        assert cache_key in self.cache
+        all_blame_lines = self.cache[cache_key]
+        if all_blame_lines is None:
+            return None
+        if start_line == -1 and end_line == -1:
+            return all_blame_lines
+        assert start_line >= 0
+        assert end_line >= 0
+        assert end_line <= len(all_blame_lines)
+        assert start_line <= len(all_blame_lines)
+        assert start_line <= end_line
+        return all_blame_lines[start_line:end_line]
+
+    def get_parsed_git_blame_for(self, git_repo, base_revision, path,
+                                 start_line=-1, end_line=-1):
+        return parse_blame_output_line_porcelain(
+            self.get_blame_output_for(git_repo, base_revision, path, start_line,
+                                      end_line))
+
+
+blameOutputCache = BlameOutputCache()
+
+
 def find_reviewers_for_
diff _heuristic(
diff ):
     # Heuristic 1: assume good reviewers are the ones that touched the same
     # lines before as this patch is touching.
@@ -496,23 +544,18 @@ def find_reviewers_for_
diff _heuristic(
diff ):
         for hunk in change.hunks:
             for start_line, end_line in hunk.actual_lines_changed_offset:
                 # Collect git blame results for authors in those ranges.
-                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
-                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
-                           git_repo, start_line, end_line, base_revision, path)
-                blame_output = get_git_cmd_output(cmd)
                 for reviewer, nr_occurences in \
-                        parse_blame_output_line_porcelain(blame_output).items():
+                        blameOutputCache.get_parsed_git_blame_for(
+                            git_repo, base_revision, path, start_line, end_line
+                        ).items():
                     if reviewer not in reviewers2nr_lines_touched:
                         reviewers2nr_lines_touched[reviewer] = 0
                     reviewers2nr_lines_touched[reviewer] += nr_occurences
         # Compute heuristic 2: don't look at context, just at files touched.
         # Collect git blame results for authors in those ranges.
-        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
-               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
-                                                     path)
-        blame_output = get_git_cmd_output(cmd)
-        for reviewer, nr_occurences in parse_blame_output_line_porcelain(
-                blame_output).items():
+        for reviewer, nr_occurences in \
+                blameOutputCache.get_parsed_git_blame_for(
+                    git_repo, base_revision, path).items():
             if reviewer not in reviewers2nr_files_touched:
                 reviewers2nr_files_touched[reviewer] = 0
             reviewers2nr_files_touched[reviewer] += 1