[llvm] [CI] Remove buildkite from metrics container (PR #143049)
Aiden Grossman via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 5 16:57:47 PDT 2025
https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/143049
Now that Buildkite has been sunset, remove the Buildkite tracking from the metrics container, as it no longer does anything.
From 69d80a89926dd21fbd2a2faf0f554e396e837677 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Thu, 5 Jun 2025 23:56:44 +0000
Subject: [PATCH] [CI] Remove buildkite from metrics container
Now that buildkite has been sunsetted, remove buildkite tracking from
the metrics container as it does not do anything.
---
.ci/metrics/metrics.py | 203 +----------------------------------------
1 file changed, 2 insertions(+), 201 deletions(-)
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 171bc20a95710..143e6ab4cf46a 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,12 +1,9 @@
import collections
import datetime
-import dateutil
import github
-import json
import logging
import os
import requests
-import sys
import time
from dataclasses import dataclass
@@ -55,18 +52,6 @@
# by trial and error).
GRAFANA_METRIC_MAX_AGE_MN = 120
-# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
-# the metric name in Grafana. This is important not to lose metrics history
-# if the workflow name changes.
-BUILDKITE_WORKFLOW_TO_TRACK = {
- ":linux: Linux x64": "buildkite_linux",
- ":windows: Windows x64": "buildkite_windows",
-}
-
-# Number of builds to fetch per page. Since we scrape regularly, this can
-# remain small.
-BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
-
@dataclass
class JobMetrics:
@@ -86,181 +71,6 @@ class GaugeMetric:
time_ns: int
-def buildkite_fetch_page_build_list(
- buildkite_token: str, after_cursor: str = None
-) -> list[dict[str, str]]:
- """Fetches a page of the build list using the GraphQL BuildKite API.
-
- Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
- or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
- older than the one pointer by |after_cursor| if provided.
- The |after_cursor| value is taken from the previous page returned by the
- API.
-
- Args:
- buildkite_token: the secret token to authenticate GraphQL requests.
- after_cursor: cursor after which to start the page fetch.
-
- Returns:
- The most recent builds after cursor (if set) with the following format:
- [
- {
- "cursor": <value>,
- "number": <build-number>,
- }
- ]
- """
-
- BUILDKITE_GRAPHQL_QUERY = """
- query OrganizationShowQuery {{
- organization(slug: "llvm-project") {{
- pipelines(search: "Github pull requests", first: 1) {{
- edges {{
- node {{
- builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
- edges {{
- cursor
- node {{
- number
- }}
- }}
- }}
- }}
- }}
- }}
- }}
- }}
- """
- query = BUILDKITE_GRAPHQL_QUERY.format(
- PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
- AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
- )
- query = json.dumps({"query": query})
- url = "https://graphql.buildkite.com/v1"
- headers = {
- "Authorization": "Bearer " + buildkite_token,
- "Content-Type": "application/json",
- }
- data = requests.post(url, data=query, headers=headers).json()
- # De-nest the build list.
- if "errors" in data:
- logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
- return []
- builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
- "edges"
- ]
- # Fold cursor info into the node dictionnary.
- return [{**x["node"], "cursor": x["cursor"]} for x in builds]
-
-
-def buildkite_get_build_info(build_number: str) -> dict:
- """Returns all the info associated with the provided build number.
-
- Note: for unknown reasons, graphql returns no jobs for a given build,
- while this endpoint does, hence why this uses this API instead of graphql.
-
- Args:
- build_number: which build number to fetch info for.
-
- Returns:
- The info for the target build, a JSON dictionnary.
- """
-
- URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
- return requests.get(URL.format(build_number)).json()
-
-
-def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
- """Returns all the running/pending BuildKite builds.
-
- Args:
- buildkite_token: the secret token to authenticate GraphQL requests.
- last_cursor: the cursor to stop at if set. If None, a full page is fetched.
- """
- output = []
- cursor = None
- while True:
- page = buildkite_fetch_page_build_list(buildkite_token, cursor)
- if len(page) == 0:
- break
- cursor = page[-1]["cursor"]
- output += page
- return output
-
-
-def buildkite_get_metrics(
- buildkite_token: str, previously_incomplete: set[int]
-) -> (list[JobMetrics], set[int]):
- """Returns a tuple with:
-
- - the metrics recorded for newly completed workflow jobs.
- - the set of workflow still running now.
-
- Args:
- buildkite_token: the secret token to authenticate GraphQL requests.
- previously_incomplete: the set of running workflows the last time this
- function was called.
- """
-
- running_builds = buildkite_get_incomplete_tasks(buildkite_token)
- incomplete_now = set([x["number"] for x in running_builds])
- output = []
-
- for build_id in previously_incomplete:
- if build_id in incomplete_now:
- continue
-
- info = buildkite_get_build_info(build_id)
- metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
- for job in info["jobs"]:
- # This workflow is not interesting to us.
- if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
- continue
-
- # Don't count canceled jobs.
- if job["canceled_at"]:
- continue
-
- created_at = dateutil.parser.isoparse(job["created_at"])
- scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
- started_at = dateutil.parser.isoparse(job["started_at"])
- finished_at = dateutil.parser.isoparse(job["finished_at"])
-
- job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
- queue_time = (started_at - scheduled_at).seconds
- run_time = (finished_at - started_at).seconds
- status = bool(job["passed"])
-
- # Grafana will refuse to ingest metrics older than ~2 hours, so we
- # should avoid sending historical data.
- metric_age_mn = (
- datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
- ).total_seconds() / 60
- if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
- logging.warning(
- f"Job {job['name']} from workflow {build_id} dropped due"
- + f" to staleness: {metric_age_mn}mn old."
- )
- continue
-
- metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
- workflow_id = build_id
- workflow_name = "Github pull requests"
- output.append(
- JobMetrics(
- job_name,
- queue_time,
- run_time,
- status,
- metric_timestamp_ns,
- workflow_id,
- workflow_name,
- )
- )
-
- return output, incomplete_now
-
-
def github_get_metrics(
github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
) -> tuple[list[JobMetrics], int]:
@@ -478,7 +288,6 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
def main():
# Authenticate with Github
github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
- buildkite_token = os.environ["BUILDKITE_TOKEN"]
grafana_api_key = os.environ["GRAFANA_API_KEY"]
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
@@ -486,9 +295,6 @@ def main():
# Because the Github queries are broken, we'll simply log a 'processed'
# bit for the last COUNT_TO_PROCESS workflows.
gh_last_workflows_seen_as_completed = set()
- # Stores the list of pending/running builds in BuildKite we need to check
- # at the next iteration.
- bk_incomplete = set()
# Enter the main loop. Every five minutes we wake up and dump metrics for
# the relevant jobs.
@@ -500,13 +306,8 @@ def main():
github_repo, gh_last_workflows_seen_as_completed
)
- bk_metrics, bk_incomplete = buildkite_get_metrics(
- buildkite_token, bk_incomplete
- )
-
- metrics = gh_metrics + bk_metrics
- upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
- logging.info(f"Uploaded {len(metrics)} metrics")
+ upload_metrics(gh_metrics, grafana_metrics_userid, grafana_api_key)
+ logging.info(f"Uploaded {len(gh_metrics)} metrics")
time.sleep(SCRAPE_INTERVAL_SECONDS)
More information about the llvm-commits
mailing list