[llvm] [CI] Extend metrics container to log BuildKite metrics (PR #130996)
Aiden Grossman via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 12 13:00:57 PDT 2025
================
@@ -70,6 +83,170 @@ class GaugeMetric:
time_ns: int
+def buildkite_fetch_page_build_list(
+ buildkite_token: str, after_cursor: str = None
+) -> list[dict[str, str]]:
+ """Fetches a page of the build list using the GraphQL BuildKite API.
+ Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
+ or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
+ older than the one pointed to by |after_cursor| if provided.
+ The |after_cursor| value is taken from the previous page returned by the
+ API.
+ Args:
+ buildkite_token: the secret token to authenticate GraphQL requests.
+ after_cursor: cursor after which to start the page fetch.
+ Returns:
+ The most recent builds after cursor (if set) with the following format:
+ [
+ {
+ "cursor": <value>,
+ "number": <build-number>,
+ }
+ ]
+ """
+
+ BUILDKITE_GRAPHQL_QUERY = """
+ query OrganizationShowQuery {{
+ organization(slug: "llvm-project") {{
+ pipelines(search: "Github pull requests", first: 1) {{
+ edges {{
+ node {{
+ builds (state: [RUNNING, SCHEDULED, CREATING], first: {PAGE_SIZE}, after: {AFTER}) {{
+ edges {{
+ cursor
+ node {{
+ number
+ }}
+ }}
+ }}
+ }}
+ }}
+ }}
+ }}
+ }}
+ """
+ data = BUILDKITE_GRAPHQL_QUERY.format(
+ PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
+ AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
+ )
+ data = data.replace("\n", "").replace('"', '\\"')
+ data = '{ "query": "' + data + '" }'
+ url = "https://graphql.buildkite.com/v1"
+ headers = {
+ "Authorization": "Bearer " + buildkite_token,
+ "Content-Type": "application/json",
+ }
+ r = requests.post(url, data=data, headers=headers)
+ data = r.json()
+ # De-nest the build list.
+ if "errors" in data:
+ logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
+ return []
+ builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
+ "edges"
+ ]
+ # Fold cursor info into the node dictionary.
+ return [{**x["node"], "cursor": x["cursor"]} for x in builds]
+
+
+def buildkite_get_build_info(build_number: str) -> dict:
+ """Returns all the info associated with the provided build number.
+ Note: for unknown reasons, graphql returns no jobs for a given build,
+ while this endpoint does, hence why this uses this API instead of graphql.
+ Args:
+ build_number: which build number to fetch info for.
+ Returns:
+ The info for the target build, a JSON dictionary.
+ """
+
+ URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
+ return requests.get(URL.format(build_number)).json()
+
+
+def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
+ """Returns all the running/pending BuildKite builds.
+ Args:
+ buildkite_token: the secret token to authenticate GraphQL requests.
+ last_cursor: the cursor to stop at if set. If None, a full page is fetched.
+ """
+ output = []
+ cursor = None
+ while True:
+ page = buildkite_fetch_page_build_list(buildkite_token, cursor)
+ if len(page) == 0:
+ break
+ cursor = page[-1]["cursor"]
+ output += page
+ return output
+
+
+def buildkite_get_metrics(
+ buildkite_token: str, previously_incomplete: set[int]
+) -> (list[JobMetrics], set[int]):
+ """Returns a tuple with:
+ - the metrics recorded for newly completed workflow jobs.
+ - the set of workflow still running now.
+
+ Args:
+ buildkite_token: the secret token to authenticate GraphQL requests.
+ previously_incomplete: the set of running workflows the last time this
+ function was called.
+ """
+
+ running_builds = buildkite_get_incomplete_tasks(buildkite_token)
+ incomplete_now = set([x["number"] for x in running_builds])
+ output = []
+
+ for build_id in previously_incomplete:
+ if build_id in incomplete_now:
+ continue
+
+ info = buildkite_get_build_info(build_id)
+ metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
+ for job in info["jobs"]:
+ # Skip this job.
+ if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
+ continue
+
+ created_at = dateutil.parser.isoparse(job["created_at"])
+ scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
+ started_at = dateutil.parser.isoparse(job["started_at"])
+ finished_at = dateutil.parser.isoparse(job["finished_at"])
+
+ job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
+ queue_time = (started_at - scheduled_at).seconds
+ run_time = (finished_at - started_at).seconds
+ status = bool(job["passed"])
+
+ # Grafana will refuse to ingest metrics older than ~2 hours, so we
+ # should avoid sending historical data.
+ metric_age_mn = (
+ datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
+ ).total_seconds() / 60
+ if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
+ logging.info(
----------------
boomanaiden154 wrote:
Maybe `logging.warning`? That would imply we're not running frequently enough/not picking up jobs if we hit this condition.
https://github.com/llvm/llvm-project/pull/130996
More information about the llvm-commits
mailing list