[llvm] [CI] Add queue size, running count metrics (PR #122714)

Nathan Gauër via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 13 06:38:33 PST 2025


https://github.com/Keenuts created https://github.com/llvm/llvm-project/pull/122714

This commit allows the container to report two additional metrics at
every sampling event: the number of queued workflows and the number of
running workflows.
This should allow us to better understand cluster load and to adjust
the autoscaling parameters.
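
Concretely, with the GaugeMetric formatting added in this PR, the two
new metrics reach Grafana as push lines of the following form (the
counts and the timestamp are made-up illustration values):

    metrics_container_queued_workflow_count value=3 1736778000000000000
    metrics_container_running_workflow_count value=7 1736778000000000000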

From 1a505a54db35227b98fffaade09fd2913af95c86 Mon Sep 17 00:00:00 2001
From: Nathan Gauër <brioche at google.com>
Date: Mon, 13 Jan 2025 14:36:29 +0100
Subject: [PATCH 1/4] [CI] Always send a heartbeat metric
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This script was set up to upload metrics to Grafana only when a new
workflow was available.
If either the Grafana or GitHub token becomes stale, no metrics get
recorded either.

We have alerting in place to detect a lack of updates, but because we
only uploaded metrics on new workflows, there were normal cases where
no data would get uploaded for a few hours (for example, late at night
or during the weekend).
For that reason, the no-data alert delay had to be set quite high.

By adding a fixed heartbeat to the uploaded metrics, we know we MUST
receive at least one metric every 5 minutes, and the monitoring can be
made more reactive.
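
With the existing JobMetrics upload format in metrics.py, this
heartbeat reaches Grafana as a push line roughly like the one below.
The queue_time/run_time/status fields are fixed dummy values, and the
timestamp is a made-up time.time_ns() reading, shown only for
illustration:

    metrics_container_heartbeat queue_time=1,run_time=2,status=3 1736778000000000000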

Signed-off-by: Nathan Gauër <brioche at google.com>
---
 .ci/metrics/metrics.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 55025e50d1081f..50360ddefd24c3 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -147,6 +147,15 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
             f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
         )
 
+def make_heartbeat_metric():
+  return JobMetrics(
+      "metrics_container_heartbeat",
+      1, # queue time seconds
+      2, # run time seconds
+      3, # job result
+      time.time_ns(), # created at ns
+      0, # workflow run ID
+  )
 
 def main():
     # Authenticate with Github
@@ -166,11 +175,14 @@ def main():
     while True:
         current_metrics = get_metrics(github_repo, workflows_to_track)
         if len(current_metrics) == 0:
-            print("No metrics found to upload.", file=sys.stderr)
-            continue
+            print("No metrics found to upload.", file=sys.stdout)
+
+        # Always send a heartbeat metric so we can monitor if this container
+        # is still able to log to Grafana.
+        current_metrics.append(make_heartbeat_metric())
 
         upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
-        print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr)
+        print(f"Uploaded {len(current_metrics)} metrics", file=sys.stdout)
 
         for workflow_metric in reversed(current_metrics):
             workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id

From 35cf962f585c1bdc9b40c3f04514f722750869ca Mon Sep 17 00:00:00 2001
From: Nathan Gauër <brioche at google.com>
Date: Mon, 13 Jan 2025 14:58:16 +0100
Subject: [PATCH 2/4] clang-format

---
 .ci/metrics/metrics.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 50360ddefd24c3..0e289651785f1c 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -147,15 +147,17 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
             f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
         )
 
+
 def make_heartbeat_metric():
-  return JobMetrics(
-      "metrics_container_heartbeat",
-      1, # queue time seconds
-      2, # run time seconds
-      3, # job result
-      time.time_ns(), # created at ns
-      0, # workflow run ID
-  )
+    return JobMetrics(
+        "metrics_container_heartbeat",
+        1, # queue time seconds
+        2, # run time seconds
+        3, # job result
+        time.time_ns(), # created at ns
+        0, # workflow run ID
+    )
+
 
 def main():
     # Authenticate with Github

From f569206439dad29bbb0e8e53dbe1deeb471e03cf Mon Sep 17 00:00:00 2001
From: Nathan Gauër <brioche at google.com>
Date: Mon, 13 Jan 2025 15:06:17 +0100
Subject: [PATCH 3/4] clang-format

---
 .ci/metrics/metrics.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 0e289651785f1c..6eefcdcec93bfb 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -151,11 +151,11 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 def make_heartbeat_metric():
     return JobMetrics(
         "metrics_container_heartbeat",
-        1, # queue time seconds
-        2, # run time seconds
-        3, # job result
-        time.time_ns(), # created at ns
-        0, # workflow run ID
+        1,  # queue time seconds
+        2,  # run time seconds
+        3,  # job result
+        time.time_ns(),  # created at ns
+        0,  # workflow run ID
     )
 
 

From 148a0b0b758ef6358f859c7d5d2a1fc434aceba5 Mon Sep 17 00:00:00 2001
From: Nathan Gauër <brioche at google.com>
Date: Mon, 13 Jan 2025 15:34:54 +0100
Subject: [PATCH 4/4] [CI] Add queue size, running count metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit allows the container to report two additional metrics at
every sampling event: the number of queued workflows and the number of
running workflows.
This should allow us to better understand cluster load and to adjust
the autoscaling parameters.

Signed-off-by: Nathan Gauër <brioche at google.com>
---
 .ci/metrics/metrics.py | 55 +++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 6eefcdcec93bfb..3dc239a4e08944 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -26,6 +26,13 @@ class JobMetrics:
     workflow_id: int
 
 
+ at dataclass
+class GaugeMetric:
+    name: str
+    value: int
+    time_ns: int
+
+
 def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, int]):
     """Gets the metrics for specified Github workflows.
 
@@ -49,8 +56,21 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
 
     workflows_to_include = set(workflows_to_track.keys())
 
+    running_workflow_count = 0
+    queued_workflow_count = 0
+
     while len(workflows_to_include) > 0:
         workflow_run = next(workflow_runs)
+
+        if workflow_run.status == "in_progress":
+          running_workflow_count += 1
+
+        # Other states are available (pending, waiting, etc), but the meaning
+        # is not documented (See #70540).
+        # "queued" seems to be the info we want.
+        if workflow_run.status == "queued":
+          queued_workflow_count += 1
+
         if workflow_run.status != "completed":
             continue
 
@@ -113,6 +133,11 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in
             )
         )
 
+    workflow_metrics.append(GaugeMetric("metrics_container_queued_workflow_count", queued_workflow_count, time.time_ns()))
+    workflow_metrics.append(GaugeMetric("metrics_container_running_workflow_count", running_workflow_count, time.time_ns()))
+    # Always send a heartbeat metric so we can monitor if this container is still able to log to Grafana.
+    workflow_metrics.append(GaugeMetric("metrics_container_heartbeat", 1, time.time_ns()))
+
     return workflow_metrics
 
 
@@ -129,10 +154,14 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
     """
     metrics_batch = []
     for workflow_metric in workflow_metrics:
-        workflow_formatted_name = workflow_metric.job_name.lower().replace(" ", "_")
-        metrics_batch.append(
-            f"{workflow_formatted_name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}"
-        )
+        if isinstance(workflow_metric, GaugeMetric):
+            name = workflow_metric.name.lower().replace(" ", "_")
+            metrics_batch.append(f"{name} value={workflow_metric.value} {workflow_metric.time_ns}")
+        else:
+            name = workflow_metric.job_name.lower().replace(" ", "_")
+            metrics_batch.append(
+                f"{name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}"
+            )
 
     request_data = "\n".join(metrics_batch)
     response = requests.post(
@@ -148,17 +177,6 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
         )
 
 
-def make_heartbeat_metric():
-    return JobMetrics(
-        "metrics_container_heartbeat",
-        1,  # queue time seconds
-        2,  # run time seconds
-        3,  # job result
-        time.time_ns(),  # created at ns
-        0,  # workflow run ID
-    )
-
-
 def main():
     # Authenticate with Github
     auth = Auth.Token(os.environ["GITHUB_TOKEN"])
@@ -179,15 +197,12 @@ def main():
         if len(current_metrics) == 0:
             print("No metrics found to upload.", file=sys.stdout)
 
-        # Always send a heartbeat metric so we can monitor if this container
-        # is still able to log to Grafana.
-        current_metrics.append(make_heartbeat_metric())
-
         upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
         print(f"Uploaded {len(current_metrics)} metrics", file=sys.stdout)
 
         for workflow_metric in reversed(current_metrics):
-            workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id
+            if isinstance(workflow_metric, JobMetrics):
+              workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id
 
         time.sleep(SCRAPE_INTERVAL_SECONDS)
 

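For readers who want to see the end result without running the
container, here is a minimal, self-contained sketch (not part of the
patch) that mirrors the serialization logic of upload_metrics after
this change. The dataclasses copy the ones in .ci/metrics/metrics.py;
the sample job name, counts, IDs, and timestamps are made up:

import time
from dataclasses import dataclass


@dataclass
class JobMetrics:
    job_name: str
    queue_time: int
    run_time: int
    status: int
    created_at_ns: int
    workflow_id: int


@dataclass
class GaugeMetric:
    name: str
    value: int
    time_ns: int


def format_batch(workflow_metrics):
    # Build the request body sent to Grafana: one line per metric,
    # gauges as "name value=<v> <ns>", jobs with the original fields.
    lines = []
    for metric in workflow_metrics:
        if isinstance(metric, GaugeMetric):
            name = metric.name.lower().replace(" ", "_")
            lines.append(f"{name} value={metric.value} {metric.time_ns}")
        else:
            name = metric.job_name.lower().replace(" ", "_")
            lines.append(
                f"{name} queue_time={metric.queue_time},"
                f"run_time={metric.run_time},"
                f"status={metric.status} {metric.created_at_ns}"
            )
    return "\n".join(lines)


if __name__ == "__main__":
    now = time.time_ns()
    print(
        format_batch(
            [
                JobMetrics("Check code formatting", 42, 300, 1, now, 12345),
                GaugeMetric("metrics_container_queued_workflow_count", 3, now),
                GaugeMetric("metrics_container_running_workflow_count", 7, now),
                GaugeMetric("metrics_container_heartbeat", 1, now),
            ]
        )
    )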


More information about the llvm-commits mailing list