[llvm] [CI] Add queue size, running count metrics (PR #122714)

Mon Jan 13 07:32:01 PST 2025

https://github.com/Keenuts updated https://github.com/llvm/llvm-project/pull/122714

>From 6616f8ad75c26ea3bd1a977a0592d268ee798855 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Mon, 13 Jan 2025 14:36:29 +0100
Subject: [PATCH 1/2] [CI] Add queue size, running count metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit allows the container to report 3 additional metrics at
every sampling event:
- a heartbeat
- the size of the workflow queue (filtered)
- the number of running workflows (filtered)

The heartbeat is a simple metric allowing us to monitor the health of
the metrics container. Before this commit, a new metric was pushed only
when a workflow was completed. This meant we had to wait a few hours
before noticing if the metrics container was unable to push metrics.

In addition to this, this commit adds a sampling of the workflow
queue size and running count. This should allow us to better understand
the load, and improve the autoscale values we pick for the cluster.

Signed-off-by: Nathan Gauër <brioche at google.com>

clang-format

clang-format

Signed-off-by: Nathan Gauër <brioche at google.com>
---
 .ci/metrics/metrics.py | 67 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 55025e50d1081f..08d8e1c5c8abbb 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -26,6 +26,13 @@ class JobMetrics:
     workflow_id: int
 
 
+@dataclass
+class GaugeMetric:
+    name: str
+    value: int
+    time_ns: int
+
+
 def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, int]):
     """Gets the metrics for specified Github workflows.
 
@@ -49,8 +56,21 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
 
     workflows_to_include = set(workflows_to_track.keys())
 
+    running_workflow_count = 0
+    queued_workflow_count = 0
+
     while len(workflows_to_include) > 0:
         workflow_run = next(workflow_runs)
+
+        if workflow_run.name in workflows_to_track:
+            # Other states are available (pending, waiting, etc), but the meaning
+            # is not documented (See #70540).
+            # "queued" seems to be the info we want.
+            if workflow_run.status == "queued":
+                queued_workflow_count += 1
+            elif workflow_run.status == "in_progress":
+                running_workflow_count += 1
+
         if workflow_run.status != "completed":
             continue
 
@@ -113,6 +133,29 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
             )
         )
 
+    workflow_metrics.append(
+        GaugeMetric(
+            "premerge_queued_workflow_count",
+            queued_workflow_count,
+            time.time_ns(),
+        )
+    )
+    workflow_metrics.append(
+        GaugeMetric(
+            "premerge_running_workflow_count",
+            running_workflow_count,
+            time.time_ns(),
+        )
+    )
+    # Always send a hearbeat metric so we can monitor is this container is still able to log to Grafana.
+    workflow_metrics.append(
+        GaugeMetric(
+            "metrics_container_heartbeat",
+            1,
+            time.time_ns()
+        )
+    )
+
     return workflow_metrics
 
 
@@ -129,10 +172,16 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
     """
     metrics_batch = []
     for workflow_metric in workflow_metrics:
-        workflow_formatted_name = workflow_metric.job_name.lower().replace(" ", "_")
-        metrics_batch.append(
-            f"{workflow_formatted_name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}"
-        )
+        if isinstance(workflow_metric, GaugeMetric):
+            name = workflow_metric.name.lower().replace(" ", "_")
+            metrics_batch.append(
+                f"{name} value={workflow_metric.value} {workflow_metric.time_ns}"
+            )
+        else:
+            name = workflow_metric.job_name.lower().replace(" ", "_")
+            metrics_batch.append(
+                f"{name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}"
+            )
 
     request_data = "\n".join(metrics_batch)
     response = requests.post(
@@ -166,14 +215,16 @@ def main():
     while True:
         current_metrics = get_metrics(github_repo, workflows_to_track)
         if len(current_metrics) == 0:
-            print("No metrics found to upload.", file=sys.stderr)
-            continue
+            print("No metrics found to upload.", file=sys.stdout)
 
         upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
-        print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr)
+        print(f"Uploaded {len(current_metrics)} metrics", file=sys.stdout)
 
         for workflow_metric in reversed(current_metrics):
-            workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id
+            if isinstance(workflow_metric, JobMetrics):
+              workflows_to_track[
+                  workflow_metric.job_name
+              ] = workflow_metric.workflow_id
 
         time.sleep(SCRAPE_INTERVAL_SECONDS)
 

>From e80b334bdec4373cdfd971d86324bd9978232d67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Mon, 13 Jan 2025 16:31:42 +0100
Subject: [PATCH 2/2] clang-format

---
 .ci/metrics/metrics.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 08d8e1c5c8abbb..272c21b2a7015d 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -149,11 +149,7 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
     )
     # Always send a hearbeat metric so we can monitor is this container is still able to log to Grafana.
     workflow_metrics.append(
-        GaugeMetric(
-            "metrics_container_heartbeat",
-            1,
-            time.time_ns()
-        )
+        GaugeMetric("metrics_container_heartbeat", 1, time.time_ns())
     )
 
     return workflow_metrics
@@ -222,9 +218,9 @@ def main():
 
         for workflow_metric in reversed(current_metrics):
             if isinstance(workflow_metric, JobMetrics):
-              workflows_to_track[
-                  workflow_metric.job_name
-              ] = workflow_metric.workflow_id
+                workflows_to_track[
+                    workflow_metric.job_name
+                ] = workflow_metric.workflow_id
 
         time.sleep(SCRAPE_INTERVAL_SECONDS)