[llvm] [CI] Always send a heartbeat metric (PR #122708)
Nathan Gauër via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 06:06:35 PST 2025
https://github.com/Keenuts updated https://github.com/llvm/llvm-project/pull/122708
>From 1a505a54db35227b98fffaade09fd2913af95c86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Mon, 13 Jan 2025 14:36:29 +0100
Subject: [PATCH 1/3] [CI] Always send a heartbeat metric
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This script was set up to only upload metrics to Grafana when
a new workflow was available.
If either the Grafana or GitHub token became stale, no metrics would
get recorded either.
We have alerting in place to detect a lack of update, but because we
only uploaded metrics on new workflows, we could have normal cases where
no data would get uploaded for a few hours (for example, late at night
on a weekend).
For those reasons, the delay before alerting for no-data had to be set
quite high.
By adding a fixed heartbeat in the uploaded metrics, we know we MUST
receive at least 1 metric every 5 minutes, and can have a more reactive
monitoring.
Signed-off-by: Nathan Gauër <brioche at google.com>
---
.ci/metrics/metrics.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 55025e50d1081f..50360ddefd24c3 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -147,6 +147,15 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
)
+def make_heartbeat_metric():
+ return JobMetrics(
+ "metrics_container_heartbeat",
+ 1, # queue time seconds
+ 2, # run time seconds
+ 3, # job result
+ time.time_ns(), # created at ns
+ 0, # workflow run ID
+ )
def main():
# Authenticate with Github
@@ -166,11 +175,14 @@ def main():
while True:
current_metrics = get_metrics(github_repo, workflows_to_track)
if len(current_metrics) == 0:
- print("No metrics found to upload.", file=sys.stderr)
- continue
+ print("No metrics found to upload.", file=sys.stdout)
+
+ # Always send a heartbeat metric so we can monitor if this container
+ # is still able to log to Grafana.
+ current_metrics.append(make_heartbeat_metric())
upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
- print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr)
+ print(f"Uploaded {len(current_metrics)} metrics", file=sys.stdout)
for workflow_metric in reversed(current_metrics):
workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id
>From 35cf962f585c1bdc9b40c3f04514f722750869ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Mon, 13 Jan 2025 14:58:16 +0100
Subject: [PATCH 2/3] clang-format
---
.ci/metrics/metrics.py | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 50360ddefd24c3..0e289651785f1c 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -147,15 +147,17 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
)
+
def make_heartbeat_metric():
- return JobMetrics(
- "metrics_container_heartbeat",
- 1, # queue time seconds
- 2, # run time seconds
- 3, # job result
- time.time_ns(), # created at ns
- 0, # workflow run ID
- )
+ return JobMetrics(
+ "metrics_container_heartbeat",
+ 1, # queue time seconds
+ 2, # run time seconds
+ 3, # job result
+ time.time_ns(), # created at ns
+ 0, # workflow run ID
+ )
+
def main():
# Authenticate with Github
>From f569206439dad29bbb0e8e53dbe1deeb471e03cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche at google.com>
Date: Mon, 13 Jan 2025 15:06:17 +0100
Subject: [PATCH 3/3] clang-format
---
.ci/metrics/metrics.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 0e289651785f1c..6eefcdcec93bfb 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -151,11 +151,11 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
def make_heartbeat_metric():
return JobMetrics(
"metrics_container_heartbeat",
- 1, # queue time seconds
- 2, # run time seconds
- 3, # job result
- time.time_ns(), # created at ns
- 0, # workflow run ID
+ 1, # queue time seconds
+ 2, # run time seconds
+ 3, # job result
+ time.time_ns(), # created at ns
+ 0, # workflow run ID
)
More information about the llvm-commits
mailing list