[llvm] [CI] Extend metrics container to log BuildKite metrics (PR #130996)

Nathan Gauër via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 13 05:57:23 PDT 2025


https://github.com/Keenuts updated https://github.com/llvm/llvm-project/pull/130996

>From d94c4d1a48419b2f66636f50fd034dd33fec652b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Tue, 11 Mar 2025 19:17:26 +0100
Subject: [PATCH 1/5] [CI] Extend metrics container to log BuildKite metrics

The current container focuses on Github metrics. Before deprecating BuildKite,
we want to make sure the new infra quality is better, or at least the same.

Being able to view BuildKite metrics alongside GitHub metrics in Grafana
will allow us to easily present the comparison.

The BuildKite API allows filtering, but doesn't allow changing the result
ordering, meaning we are left with builds ordered by ID.
This means a completed job can appear before a running job in the list.
2 solutions from there:
 - keep the cursor on the oldest running workflow
 - keep a list of running workflows to compare.

Because there are no guarantees in workflow ordering, waiting for the
oldest build to complete before reporting any newer build could mean
delaying the reporting of a more recent build's completion by a few hours.
And because Grafana cannot ingest metrics older than 2 hours, this is
not an option.

Thus we are left with the second solution: remember which jobs were running
during the last iteration, and record them as soon as they are
completed. BuildKite has at most ~100 pending jobs, so keeping all those
IDs should be OK.
---
 .ci/metrics/metrics.py | 190 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 189 insertions(+), 1 deletion(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 3878dce342fb4..fbf0d71cf50ce 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,5 +1,6 @@
 import collections
 import datetime
+import dateutil
 import github
 import logging
 import os
@@ -53,6 +54,18 @@
 # by trial and error).
 GRAFANA_METRIC_MAX_AGE_MN = 120
 
+# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
+# the metric name in Grafana. This is important not to lose metrics history
+# if the workflow name changes.
+BUILDKITE_WORKFLOW_TO_TRACK = {
+    ":linux: Linux x64": "buildkite_linux",
+    ":windows: Windows x64": "buildkite_windows",
+}
+
+# Number of builds to fetch per page. Since we scrape regularly, this can
+# remain small.
+BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
+
 @dataclass
 class JobMetrics:
     job_name: str
@@ -70,6 +83,170 @@ class GaugeMetric:
     time_ns: int
 
 
+def buildkite_fetch_page_build_list(
+    buildkite_token: str, after_cursor: str = None
+) -> list[dict[str, str]]:
+    """Fetches a page of the build list using the GraphQL BuildKite API.
+    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
+    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
+    older than the one pointer by |after_cursor| if provided.
+    The |after_cursor| value is taken from the previous page returned by the
+    API.
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      after_cursor: cursor after which to start the page fetch.
+    Returns:
+      The most recent builds after cursor (if set) with the following format:
+      [
+        {
+            "cursor": <value>,
+            "number": <build-number>,
+        }
+      ]
+    """
+
+    BUILDKITE_GRAPHQL_QUERY = """
+    query OrganizationShowQuery {{
+      organization(slug: "llvm-project") {{
+        pipelines(search: "Github pull requests", first: 1) {{
+          edges {{
+            node {{
+              builds (state: [RUNNING, SCHEDULED, CREATING], first: {PAGE_SIZE}, after: {AFTER}) {{
+                edges {{
+                  cursor
+                  node {{
+                    number
+                  }}
+                }}
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+    data = BUILDKITE_GRAPHQL_QUERY.format(
+        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
+        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
+    )
+    data = data.replace("\n", "").replace('"', '\\"')
+    data = '{ "query": "' + data + '" }'
+    url = "https://graphql.buildkite.com/v1"
+    headers = {
+        "Authorization": "Bearer " + buildkite_token,
+        "Content-Type": "application/json",
+    }
+    r = requests.post(url, data=data, headers=headers)
+    data = r.json()
+    # De-nest the build list.
+    if "errors" in data:
+        logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
+        return []
+    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
+        "edges"
+    ]
+    # Fold cursor info into the node dictionnary.
+    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
+
+
+def buildkite_get_build_info(build_number: str) -> dict:
+    """Returns all the info associated with the provided build number.
+    Note: for unknown reasons, graphql returns no jobs for a given build,
+    while this endpoint does, hence why this uses this API instead of graphql.
+      Args:
+        build_number: which build number to fetch info for.
+      Returns:
+        The info for the target build, a JSON dictionnary.
+    """
+
+    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
+    return requests.get(URL.format(build_number)).json()
+
+
+def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
+    """Returns all the running/pending BuildKite builds.
+    Args:
+     buildkite_token: the secret token to authenticate GraphQL requests.
+     last_cursor: the cursor to stop at if set. If None, a full page is fetched.
+    """
+    output = []
+    cursor = None
+    while True:
+        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
+        if len(page) == 0:
+            break
+        cursor = page[-1]["cursor"]
+        output += page
+    return output
+
+
+def buildkite_get_metrics(
+    buildkite_token: str, previously_incomplete: set[int]
+) -> (list[JobMetrics], set[int]):
+    """Returns a tuple with:
+    - the metrics recorded for newly completed workflow jobs.
+    - the set of workflow still running now.
+
+    Args:
+     buildkite_token: the secret token to authenticate GraphQL requests.
+     previously_incomplete: the set of running workflows the last time this
+     function was called.
+    """
+
+    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
+    incomplete_now = set([x["number"] for x in running_builds])
+    output = []
+
+    for build_id in previously_incomplete:
+        if build_id in incomplete_now:
+            continue
+
+        info = buildkite_get_build_info(build_id)
+        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
+        for job in info["jobs"]:
+            # Skip this job.
+            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
+                continue
+
+            created_at = dateutil.parser.isoparse(job["created_at"])
+            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
+            started_at = dateutil.parser.isoparse(job["started_at"])
+            finished_at = dateutil.parser.isoparse(job["finished_at"])
+
+            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
+            queue_time = (started_at - scheduled_at).seconds
+            run_time = (finished_at - started_at).seconds
+            status = bool(job["passed"])
+
+            # Grafana will refuse to ingest metrics older than ~2 hours, so we
+            # should avoid sending historical data.
+            metric_age_mn = (
+                datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
+            ).total_seconds() / 60
+            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
+                logging.info(
+                    f"Job {job['name']} from workflow {build_id} dropped due"
+                    + f" to staleness: {metric_age_mn}mn old."
+                )
+                continue
+
+            metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
+            workflow_id = build_id
+            workflow_name = "Github pull requests"
+            output.append(
+                JobMetrics(
+                    job_name,
+                    queue_time,
+                    run_time,
+                    status,
+                    metric_timestamp_ns,
+                    workflow_id,
+                    workflow_name,
+                )
+            )
+
+    return output, incomplete_now
+
 def github_get_metrics(
     github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
 ) -> tuple[list[JobMetrics], int]:
@@ -292,6 +469,7 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 def main():
     # Authenticate with Github
     github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
+    buildkite_token = os.environ["BUILDKITE_TOKEN"]
     grafana_api_key = os.environ["GRAFANA_API_KEY"]
     grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
 
@@ -299,6 +477,9 @@ def main():
     # Because the Github queries are broken, we'll simply log a 'processed'
     # bit for the last COUNT_TO_PROCESS workflows.
     gh_last_workflows_seen_as_completed = set()
+    # Stores the list of pending/running builds in BuildKite we need to check
+    # at the next iteration.
+    bk_incomplete = set()
 
     # Enter the main loop. Every five minutes we wake up and dump metrics for
     # the relevant jobs.
@@ -306,9 +487,16 @@ def main():
         github_object = Github(auth=github_auth)
         github_repo = github_object.get_repo("llvm/llvm-project")
 
-        metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
+        gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
             github_repo, gh_last_workflows_seen_as_completed
         )
+        gh_metrics = []
+
+        bk_metrics, bk_incomplete = buildkite_get_metrics(
+            buildkite_token, bk_incomplete
+        )
+
+        metrics = gh_metrics + bk_metrics
         upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
         logging.info(f"Uploaded {len(metrics)} metrics")
 

>From 64a4b85d915042554e006d2cd11202794f511144 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Thu, 13 Mar 2025 10:32:38 +0100
Subject: [PATCH 2/5] pr-feedback

---
 .ci/metrics/metrics.py | 53 +++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index fbf0d71cf50ce..3ee1c9cdaf7df 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -87,14 +87,17 @@ def buildkite_fetch_page_build_list(
     buildkite_token: str, after_cursor: str = None
 ) -> list[dict[str, str]]:
     """Fetches a page of the build list using the GraphQL BuildKite API.
+
     Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
     or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
     older than the one pointer by |after_cursor| if provided.
     The |after_cursor| value is taken from the previous page returned by the
     API.
+
     Args:
       buildkite_token: the secret token to authenticate GraphQL requests.
       after_cursor: cursor after which to start the page fetch.
+
     Returns:
       The most recent builds after cursor (if set) with the following format:
       [
@@ -130,7 +133,7 @@ def buildkite_fetch_page_build_list(
         AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
     )
     data = data.replace("\n", "").replace('"', '\\"')
-    data = '{ "query": "' + data + '" }'
+    data = json.dumps({"query": data})  #'{ "query": "' + data + '" }'
     url = "https://graphql.buildkite.com/v1"
     headers = {
         "Authorization": "Bearer " + buildkite_token,
@@ -151,12 +154,15 @@ def buildkite_fetch_page_build_list(
 
 def buildkite_get_build_info(build_number: str) -> dict:
     """Returns all the info associated with the provided build number.
+
     Note: for unknown reasons, graphql returns no jobs for a given build,
     while this endpoint does, hence why this uses this API instead of graphql.
-      Args:
-        build_number: which build number to fetch info for.
-      Returns:
-        The info for the target build, a JSON dictionnary.
+
+    Args:
+      build_number: which build number to fetch info for.
+
+    Returns:
+      The info for the target build, a JSON dictionnary.
     """
 
     URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
@@ -165,6 +171,7 @@ def buildkite_get_build_info(build_number: str) -> dict:
 
 def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
     """Returns all the running/pending BuildKite builds.
+
     Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      last_cursor: the cursor to stop at if set. If None, a full page is fetched.
@@ -184,13 +191,14 @@ def buildkite_get_metrics(
     buildkite_token: str, previously_incomplete: set[int]
 ) -> (list[JobMetrics], set[int]):
     """Returns a tuple with:
+
     - the metrics recorded for newly completed workflow jobs.
     - the set of workflow still running now.
 
     Args:
-     buildkite_token: the secret token to authenticate GraphQL requests.
-     previously_incomplete: the set of running workflows the last time this
-     function was called.
+      buildkite_token: the secret token to authenticate GraphQL requests.
+        previously_incomplete: the set of running workflows the last time this
+        function was called.
     """
 
     running_builds = buildkite_get_incomplete_tasks(buildkite_token)
@@ -204,14 +212,31 @@ def buildkite_get_metrics(
         info = buildkite_get_build_info(build_id)
         metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
         for job in info["jobs"]:
-            # Skip this job.
+            # This workflow is not interesting to us.
             if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                 continue
 
+            # Note: BuildKite API can return empty dates for some fields
+            # depending on the completion scenario. Example, a job cancelled
+            # before even starting will get an None date for 'started_at'.
+            # For this reason, if a timestamp is missing, we consider it
+            # skipped and keep the last event value.
             created_at = dateutil.parser.isoparse(job["created_at"])
-            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
-            started_at = dateutil.parser.isoparse(job["started_at"])
-            finished_at = dateutil.parser.isoparse(job["finished_at"])
+            scheduled_at = (
+                dateutil.parser.isoparse(job["scheduled_at"])
+                if "scheduled_at" in job
+                else created_at
+            )
+            started_at = (
+                dateutil.parser.isoparse(job["started_at"])
+                if "started_at" in job
+                else scheduled_at
+            )
+            finished_at = (
+                dateutil.parser.isoparse(job["finished_at"])
+                if "finished_at" in job
+                else started_at
+            )
 
             job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
             queue_time = (started_at - scheduled_at).seconds
@@ -224,7 +249,7 @@ def buildkite_get_metrics(
                 datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
             ).total_seconds() / 60
             if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
-                logging.info(
+                logging.warning(
                     f"Job {job['name']} from workflow {build_id} dropped due"
                     + f" to staleness: {metric_age_mn}mn old."
                 )
@@ -372,7 +397,7 @@ def github_get_metrics(
                 datetime.datetime.now(datetime.timezone.utc) - completed_at
             ).total_seconds() / 60
             if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
-                logging.info(
+                logging.warning(
                     f"Job {job.id} from workflow {task.id} dropped due"
                     + f" to staleness: {metric_age_mn}mn old."
                 )

>From f0ca44f6eefb6f6031bce48977e44736cd16056a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Thu, 13 Mar 2025 10:35:10 +0100
Subject: [PATCH 3/5] import json

---
 .ci/metrics/metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 3ee1c9cdaf7df..1049820405add 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -2,6 +2,7 @@
 import datetime
 import dateutil
 import github
+import json
 import logging
 import os
 import requests

>From b23eb2f042f19b493bf35317e2964a35d82495d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Thu, 13 Mar 2025 10:40:36 +0100
Subject: [PATCH 4/5] fix query serialization

---
 .ci/metrics/metrics.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 1049820405add..c6d33e763539f 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -129,19 +129,17 @@ def buildkite_fetch_page_build_list(
       }}
     }}
     """
-    data = BUILDKITE_GRAPHQL_QUERY.format(
+    query = BUILDKITE_GRAPHQL_QUERY.format(
         PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
         AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
     )
-    data = data.replace("\n", "").replace('"', '\\"')
-    data = json.dumps({"query": data})  #'{ "query": "' + data + '" }'
+    query = json.dumps({"query": query})
     url = "https://graphql.buildkite.com/v1"
     headers = {
         "Authorization": "Bearer " + buildkite_token,
         "Content-Type": "application/json",
     }
-    r = requests.post(url, data=data, headers=headers)
-    data = r.json()
+    data = requests.post(url, data=query, headers=headers).json()
     # De-nest the build list.
     if "errors" in data:
         logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
@@ -516,7 +514,6 @@ def main():
         gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
             github_repo, gh_last_workflows_seen_as_completed
         )
-        gh_metrics = []
 
         bk_metrics, bk_incomplete = buildkite_get_metrics(
             buildkite_token, bk_incomplete

>From 0502e7e534be97a57811c3ec1458e5a67a0aa541 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Thu, 13 Mar 2025 10:52:30 +0100
Subject: [PATCH 5/5] fix to handle buildkite cancelling/failing transition
 states

---
 .ci/metrics/metrics.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index c6d33e763539f..1463ab43a812e 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -115,7 +115,7 @@ def buildkite_fetch_page_build_list(
         pipelines(search: "Github pull requests", first: 1) {{
           edges {{
             node {{
-              builds (state: [RUNNING, SCHEDULED, CREATING], first: {PAGE_SIZE}, after: {AFTER}) {{
+              builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
                 edges {{
                   cursor
                   node {{
@@ -215,27 +215,25 @@ def buildkite_get_metrics(
             if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                 continue
 
-            # Note: BuildKite API can return empty dates for some fields
-            # depending on the completion scenario. Example, a job cancelled
-            # before even starting will get an None date for 'started_at'.
-            # For this reason, if a timestamp is missing, we consider it
-            # skipped and keep the last event value.
             created_at = dateutil.parser.isoparse(job["created_at"])
             scheduled_at = (
-                dateutil.parser.isoparse(job["scheduled_at"])
-                if "scheduled_at" in job
-                else created_at
+                created_at
+                if job["scheduled_at"] is None
+                else dateutil.parser.isoparse(job["scheduled_at"])
             )
             started_at = (
-                dateutil.parser.isoparse(job["started_at"])
-                if "started_at" in job
-                else scheduled_at
-            )
-            finished_at = (
-                dateutil.parser.isoparse(job["finished_at"])
-                if "finished_at" in job
-                else started_at
+                scheduled_at
+                if job["started_at"] is None
+                else dateutil.parser.isoparse(job["started_at"])
             )
+            if job["canceled_at"] is None:
+                finished_at = (
+                    started_at
+                    if job["finished_at"] is None
+                    else dateutil.parser.isoparse(job["finished_at"])
+                )
+            else:
+                finished_at = dateutil.parser.isoparse(job["canceled_at"])
 
             job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
             queue_time = (started_at - scheduled_at).seconds



More information about the llvm-commits mailing list