[llvm] [CI] Extend metrics container to log BuildKite metrics (PR #129699)

Nathan Gauër via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 5 05:15:15 PST 2025


https://github.com/Keenuts updated https://github.com/llvm/llvm-project/pull/129699

>From 89ed6bc997b19b0e1f6b280be2e9e7779ecf7146 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Tue, 4 Mar 2025 11:36:10 +0100
Subject: [PATCH 1/4] [CI] Extend metrics container to log BuildKite metrics

The current container focuses on GitHub metrics. Before
deprecating BuildKite, we want to make sure the quality of the
new infrastructure is better, or at least the same.

Being able to compare BuildKite metrics with GitHub metrics on Grafana
will allow us to easily present the comparison.
---
 .ci/metrics/metrics.py | 165 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 161 insertions(+), 4 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index bd2b51154768d..98d328029921e 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,4 +1,6 @@
 import requests
+import dateutil
+import json
 import time
 import os
 from dataclasses import dataclass
@@ -16,6 +18,17 @@
 WORKFLOWS_TO_TRACK = ["LLVM Premerge Checks"]
 SCRAPE_INTERVAL_SECONDS = 5 * 60
 
+# Number of builds to fetch per page. Since we scrape regularly, this can
+# remain small.
+BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 10
+
+# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
+# the metric name in Grafana. This is important not to lose metrics history
+# if the workflow name changes.
+BUILDKITE_WORKFLOW_TO_TRACK = {
+    ":linux: Linux x64": "buildkite_linux",
+    ":windows: Windows x64": "buildkite_windows",
+}
 
 @dataclass
 class JobMetrics:
@@ -35,6 +48,146 @@ class GaugeMetric:
     time_ns: int
 
 
+# Fetches a page of the build list using the GraphQL BuildKite API.
+# Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last **finished** builds by
+# default, or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE **finished** builds older
+# than the one pointer by
+# |cursor| if provided.
+# The |cursor| value is taken from the previous page returned by the API.
+# The returned data had the following format:
+# [
+#   {
+#       "cursor": <value>,
+#       "number": <build-number>,
+#   }
+# ]
+def buildkite_fetch_page_build_list(buildkite_token, after_cursor=None):
+    BUILDKITE_GRAPHQL_QUERY = """
+  query OrganizationShowQuery {{
+    organization(slug: "llvm-project") {{
+      pipelines(search: "Github pull requests", first: 1) {{
+        edges {{
+          node {{
+            builds (state: [FAILED, PASSED], first: {PAGE_SIZE}, after: {AFTER}) {{
+              edges {{
+                cursor
+                node {{
+                  number
+                }}
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+  }}
+  """
+    data = BUILDKITE_GRAPHQL_QUERY.format(
+        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
+        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
+    )
+    data = data.replace("\n", "").replace('"', '\\"')
+    data = '{ "query": "' + data + '" }'
+    url = "https://graphql.buildkite.com/v1"
+    headers = {
+        "Authorization": "Bearer " + buildkite_token,
+        "Content-Type": "application/json",
+    }
+    r = requests.post(url, data=data, headers=headers)
+    data = r.json()
+    # De-nest the build list.
+    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
+        "edges"
+    ]
+    # Fold cursor info into the node dictionary.
+    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
+
+
+# Returns all the info associated with the provided |build_number|.
+# Note: for unknown reasons, graphql returns no jobs for a given build, while
+# this endpoint does, hence why this uses this API instead of graphql.
+def buildkite_get_build_info(build_number):
+    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
+    return requests.get(URL.format(build_number)).json()
+
+
+# returns the last BUILDKITE_GRAPHQL_BUILDS_PER_PAGE builds by default, or
+# until the build pointed by |last_cursor| is found.
+def buildkite_get_builds_up_to(buildkite_token, last_cursor=None):
+    output = []
+    cursor = None
+
+    while True:
+        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
+        # No cursor provided, return the first page.
+        if last_cursor is None:
+            return page
+
+        # Cursor has been provided, check if present in this page.
+        match_index = next(
+            (i for i, x in enumerate(page) if x["cursor"] == last_cursor), None
+        )
+        # Not present, continue loading more pages.
+        if match_index is None:
+            output += page
+            cursor = page[-1]["cursor"]
+            continue
+        # Cursor found, keep results up to cursor
+        output += page[:match_index]
+        return output
+
+
+# Returns a (metrics, cursor) tuple.
+# Returns the BuildKite workflow metrics up to the build pointed by |last_cursor|.
+# If |last_cursor| is None, no metrics are returned.
+# The returned cursor is either:
+#  - the last processed build.
+#  - the last build if no initial cursor was provided.
+def buildkite_get_metrics(buildkite_token, last_cursor=None):
+
+    builds = buildkite_get_builds_up_to(buildkite_token, last_cursor)
+    # Don't return any metrics if last_cursor is None.
+    # This happens when the program starts.
+    if last_cursor is None:
+        return [], builds[0]["cursor"]
+
+    last_recorded_build = last_cursor
+    output = []
+    for build in builds:
+        info = buildkite_get_build_info(build["number"])
+        last_recorded_build = build["cursor"]
+        for job in info["jobs"]:
+            # Skip this job.
+            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
+                continue
+
+            created_at = dateutil.parser.isoparse(job["created_at"])
+            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
+            started_at = dateutil.parser.isoparse(job["started_at"])
+            finished_at = dateutil.parser.isoparse(job["finished_at"])
+
+            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
+            queue_time = (started_at - scheduled_at).seconds
+            run_time = (finished_at - started_at).seconds
+            status = bool(job["passed"])
+            created_at_ns = int(created_at.timestamp()) * 10**9
+            workflow_id = build["number"]
+            workflow_name = "Github pull requests"
+            output.append(
+                JobMetrics(
+                    job_name,
+                    queue_time,
+                    run_time,
+                    status,
+                    created_at_ns,
+                    workflow_id,
+                    workflow_name,
+                )
+            )
+
+    return output, last_recorded_build
+
+
 def get_sampled_workflow_metrics(github_repo: github.Repository):
     """Gets global statistics about the Github workflow queue
 
@@ -105,7 +258,6 @@ def get_sampled_workflow_metrics(github_repo: github.Repository):
     )
     return workflow_metrics
 
-
 def get_per_workflow_metrics(
     github_repo: github.Repository, workflows_to_track: dict[str, int]
 ):
@@ -211,7 +363,6 @@ def get_per_workflow_metrics(
 
     return workflow_metrics
 
-
 def upload_metrics(workflow_metrics, metrics_userid, api_key):
     """Upload metrics to Grafana.
 
@@ -260,9 +411,12 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 def main():
     # Authenticate with Github
     auth = Auth.Token(os.environ["GITHUB_TOKEN"])
-
     grafana_api_key = os.environ["GRAFANA_API_KEY"]
     grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
+    buildkite_token = os.environ["BUILDKITE_TOKEN"]
+
+    # The last buildkite build recorded.
+    buildkite_last_cursor = None
 
     workflows_to_track = {}
     for workflow_to_track in WORKFLOWS_TO_TRACK:
@@ -274,7 +428,10 @@ def main():
         github_object = Github(auth=auth)
         github_repo = github_object.get_repo("llvm/llvm-project")
 
-        current_metrics = get_per_workflow_metrics(github_repo, workflows_to_track)
+        current_metrics, buildkite_last_cursor = buildkite_get_metrics(
+            buildkite_token, buildkite_last_cursor
+        )
+        current_metrics += get_per_workflow_metrics(github_repo, workflows_to_track)
         current_metrics += get_sampled_workflow_metrics(github_repo)
 
         upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)

>From e2e1d0723d7043f146db088029dda522142443b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Tue, 4 Mar 2025 13:59:43 +0100
Subject: [PATCH 2/4] format

---
 .ci/metrics/metrics.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 98d328029921e..fe4cfa75f66c7 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -144,7 +144,6 @@ def buildkite_get_builds_up_to(buildkite_token, last_cursor=None):
 #  - the last processed build.
 #  - the last build if no initial cursor was provided.
 def buildkite_get_metrics(buildkite_token, last_cursor=None):
-
     builds = buildkite_get_builds_up_to(buildkite_token, last_cursor)
     # Don't return any metrics if last_cursor is None.
     # This happens when the program starts.

>From e4cdb5701d9bb7a11da9cb8144516ac63474d178 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Wed, 5 Mar 2025 14:07:56 +0100
Subject: [PATCH 3/4] pr-feedback

---
 .ci/metrics/metrics.py | 80 +++++++++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 28 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index fe4cfa75f66c7..b8eff5aebe6aa 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -48,20 +48,28 @@ class GaugeMetric:
     time_ns: int
 
 
-# Fetches a page of the build list using the GraphQL BuildKite API.
-# Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last **finished** builds by
-# default, or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE **finished** builds older
-# than the one pointer by
-# |cursor| if provided.
-# The |cursor| value is taken from the previous page returned by the API.
-# The returned data had the following format:
-# [
-#   {
-#       "cursor": <value>,
-#       "number": <build-number>,
-#   }
-# ]
-def buildkite_fetch_page_build_list(buildkite_token, after_cursor=None):
+def buildkite_fetch_page_build_list(
+    buildkite_token: str, after_cursor: str = None
+) -> list[dict[str, str]]:
+    """Fetches a page of the build list using the GraphQL BuildKite API. Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last **finished** builds by default, or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE **finished** builds older than the one pointed to by |after_cursor| if provided.
+    The |after_cursor| value is taken from the previous page returned by the API.
+
+    The format of the returned data is described in the Returns section below.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      after_cursor: cursor after which to start the page fetch.
+
+    Returns:
+      The most recent builds after the cursor (if set), in the following format:
+      [
+        {
+            "cursor": <value>,
+            "number": <build-number>,
+        }
+      ]
+    """
+
     BUILDKITE_GRAPHQL_QUERY = """
   query OrganizationShowQuery {{
     organization(slug: "llvm-project") {{
@@ -103,17 +111,29 @@ def buildkite_fetch_page_build_list(buildkite_token, after_cursor=None):
     return [{**x["node"], "cursor": x["cursor"]} for x in builds]
 
 
-# Returns all the info associated with the provided |build_number|.
-# Note: for unknown reasons, graphql returns no jobs for a given build, while
-# this endpoint does, hence why this uses this API instead of graphql.
-def buildkite_get_build_info(build_number):
+def buildkite_get_build_info(build_number: str) -> dict:
+    """Returns all the info associated with the provided build number.
+    Note: for unknown reasons, graphql returns no jobs for a given build, while this endpoint does, hence why this uses this API instead of graphql.
+
+      Args:
+        build_number: which build number to fetch info for.
+
+      Returns:
+        The info for the target build, a JSON dictionary.
+    """
+
     URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
     return requests.get(URL.format(build_number)).json()
 
 
-# returns the last BUILDKITE_GRAPHQL_BUILDS_PER_PAGE builds by default, or
-# until the build pointed by |last_cursor| is found.
-def buildkite_get_builds_up_to(buildkite_token, last_cursor=None):
+def buildkite_get_builds_up_to(buildkite_token: str, last_cursor: str = None) -> list:
+    """Returns the last BUILDKITE_GRAPHQL_BUILDS_PER_PAGE builds by default, or
+    until the build pointed to by |last_cursor| is found.
+
+    Args:
+     buildkite_token: the secret token to authenticate GraphQL requests.
+     last_cursor: the cursor to stop at if set. If None, a full page is fetched.
+    """
     output = []
     cursor = None
 
@@ -137,13 +157,17 @@ def buildkite_get_builds_up_to(buildkite_token, last_cursor=None):
         return output
 
 
-# Returns a (metrics, cursor) tuple.
-# Returns the BuildKite workflow metrics up to the build pointed by |last_cursor|.
-# If |last_cursor| is None, no metrics are returned.
-# The returned cursor is either:
-#  - the last processed build.
-#  - the last build if no initial cursor was provided.
-def buildkite_get_metrics(buildkite_token, last_cursor=None):
+def buildkite_get_metrics(
+    buildkite_token: str, last_cursor: str = None
+) -> (list[JobMetrics], str):
+    """Returns a tuple with:
+    - the metrics to record until |last_cursor| is reached, or none if last cursor is None.
+    - the cursor of the most recent build processed.
+
+    Args:
+     buildkite_token: the secret token to authenticate GraphQL requests.
+     last_cursor: the cursor to stop at if set. If None, a full page is fetched.
+    """
     builds = buildkite_get_builds_up_to(buildkite_token, last_cursor)
     # Don't return any metrics if last_cursor is None.
     # This happens when the program starts.

>From dba4d04ad69fcdd0859fb7cc0c0ffaad3fa0f612 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Wed, 5 Mar 2025 14:14:27 +0100
Subject: [PATCH 4/4] expand one-liner

---
 .ci/metrics/metrics.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index b8eff5aebe6aa..716a9cd7208be 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -144,9 +144,12 @@ def buildkite_get_builds_up_to(buildkite_token: str, last_cursor: str = None) ->
             return page
 
         # Cursor has been provided, check if present in this page.
-        match_index = next(
-            (i for i, x in enumerate(page) if x["cursor"] == last_cursor), None
-        )
+        match_index = None
+        for index, item in enumerate(page):
+            if item["cursor"] == last_cursor:
+                match_index = index
+                break
+
         # Not present, continue loading more pages.
         if match_index is None:
             output += page



More information about the llvm-commits mailing list