[libcxx] [llvm] [libc++] Also restart failed jobs when they fail for a spurious reason (PR #118550)
Louis Dionne via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 13:52:10 PST 2024
https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/118550
>From d0f5c241d58d5bb1e80ab2450f5415b7ffc79a7f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 3 Dec 2024 16:48:30 -0500
Subject: [PATCH 1/2] [libc++] Also restart failed jobs when they fail for a
spurious reason
Since we moved to a Docker-in-Docker setup, CI jobs sometimes fail due
to the Docker VM dying with 'context cancelled' errors. This is currently
not recognized as a spurious failure, which leads to the job not being
automatically restarted. This patch fixes that.
This requires reorganizing the restarter workflow a bit to consider
failed runs differently based on whether they have failed for a
legitimate or spurious reason.
---
.../libcxx-restart-preempted-jobs.yaml | 131 +++++++++---------
1 file changed, 63 insertions(+), 68 deletions(-)
diff --git a/.github/workflows/libcxx-restart-preempted-jobs.yaml b/.github/workflows/libcxx-restart-preempted-jobs.yaml
index 21879ce19c27c5..b6b3a11dee973d 100644
--- a/.github/workflows/libcxx-restart-preempted-jobs.yaml
+++ b/.github/workflows/libcxx-restart-preempted-jobs.yaml
@@ -32,34 +32,29 @@ jobs:
uses: actions/github-script at 60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
with:
script: |
- const failure_regex = /Process completed with exit code 1./
- const preemption_regex = /The runner has received a shutdown signal/
-
- const wf_run = context.payload.workflow_run
- core.notice(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
+ const FAILURE_REGEX = /Process completed with exit code 1./
+ const SPURIOUS_FAILURE_REGEX = /^context cancelled$/
+ const PREEMPTION_REGEX = /The runner has received a shutdown signal/
+ function log(msg) {
+ core.notice(msg)
+ }
- async function create_check_run(conclusion, message) {
- // Create a check run on the given workflow run to indicate if
- // we are restarting the workflow or not.
- if (conclusion != 'success' && conclusion != 'skipped' && conclusion != 'neutral') {
- core.setFailed('Invalid conclusion: ' + conclusion)
- }
- await github.rest.checks.create({
- owner: context.repo.owner,
- repo: context.repo.repo,
- name: 'Restart Preempted Job',
- head_sha: wf_run.head_sha,
- status: 'completed',
- conclusion: conclusion,
- output: {
- title: 'Restarted Preempted Job',
- summary: message
- }
- })
+ // Return whether a failed check run actually failed due to a spurious error.
+ // This requires looking at the logs of the run.
+ function is_spurious_failure(check_run) {
+ return false;
+ // github.rest.actions.downloadJobLogsForWorkflowRun({
+ // owner,
+ // repo,
+ // job_id,
+ // });
}
- console.log('Listing check runs for suite')
+ const wf_run = context.payload.workflow_run
+ log(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
+
+ log('Listing check runs for suite')
const check_suites = await github.rest.checks.listForSuite({
owner: context.repo.owner,
repo: context.repo.repo,
@@ -67,66 +62,66 @@ jobs:
per_page: 100 // FIXME: We don't have 100 check runs yet, but we should handle this better.
})
- check_run_ids = [];
+ preemptions = [];
+ spurious_failures = [];
+ legitimate_failures = [];
for (check_run of check_suites.data.check_runs) {
- console.log('Checking check run: ' + check_run.id);
+ log(`Checking check run: ${check_run.id}`);
if (check_run.status != 'completed') {
- console.log('Check run was not completed. Skipping.');
+ log('Check run was not completed. Skipping.');
continue;
}
+
if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
- console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
+ log(`Check run had conclusion: ${check_run.conclusion}. Skipping.`);
continue;
}
- check_run_ids.push(check_run.id);
- }
-
- has_preempted_job = false;
-
- for (check_run_id of check_run_ids) {
- console.log('Listing annotations for check run: ' + check_run_id);
annotations = await github.rest.checks.listAnnotations({
owner: context.repo.owner,
repo: context.repo.repo,
- check_run_id: check_run_id
+ check_run_id: check_run.id
})
- for (annotation of annotations.data) {
- if (annotation.annotation_level != 'failure') {
- continue;
- }
-
- const preemption_match = annotation.message.match(preemption_regex);
-
- if (preemption_match != null) {
- console.log('Found preemption message: ' + annotation.message);
- has_preempted_job = true;
- }
+ preemption_annotation = annotations.data.find(function(annotation) {
+ return annotation.annotation_level == 'failure' &&
+ annotation.message.match(PREEMPTION_REGEX) != null;
+ });
+ if (preemption_annotation != null) {
+ log(`Found preemption message: ${preemption_annotation.message}`);
+ preemptions.push(check_run);
+ continue; // move on to the next check run; `break` would stop classifying the remaining runs
+ }
- const failure_match = annotation.message.match(failure_regex);
- if (failure_match != null) {
- // We only want to restart the workflow if all of the failures were due to preemption.
- // We don't want to restart the workflow if there were other failures.
- core.notice('Choosing not to rerun workflow because we found a non-preemption failure' +
- 'Failure message: "' + annotation.message + '"');
- await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n'
- + 'Failure message: ' + annotation.message)
- return;
+ failure_annotation = annotations.data.find(function(annotation) {
+ return annotation.annotation_level == 'failure' &&
+ annotation.message.match(FAILURE_REGEX) != null;
+ });
+ if (failure_annotation != null) {
+ // We found a failure annotation, now classify it as spurious or legitimate.
+ if (is_spurious_failure(check_run)) {
+ log(`Found spurious failure message in run: ${check_run.id}`);
+ spurious_failures.push(check_run);
+ } else {
+ log(`Found legitimate failure annotation: ${failure_annotation.message}`);
+ legitimate_failures.push(check_run);
}
+ continue; // keep scanning later check runs; `break` would leave them unclassified
}
}
- if (!has_preempted_job) {
- core.notice('No preempted jobs found. Not restarting workflow.');
- await create_check_run('neutral', 'No preempted jobs found. Not restarting workflow.')
- return;
+ if (spurious_failures.length > 0 || preemptions.length > 0) { // arrays are always truthy, so test their length
+ log('Found some spurious failures and/or preempted jobs');
+ if (legitimate_failures.length > 0) { // an empty array must count as "no legitimate failures"
+ log('Also found some legitimate failures, so not restarting the workflow.');
+ } else {
+ log('Did not find any legitimate failures. Restarting workflow.');
+ await github.rest.actions.reRunWorkflowFailedJobs({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ run_id: context.payload.workflow_run.id
+ })
+ }
+ } else {
+ log('Did not find any preempted jobs or spurious failures. Not restarting the workflow.');
}
-
- core.notice("Restarted workflow: " + context.payload.workflow_run.id);
- await github.rest.actions.reRunWorkflowFailedJobs({
- owner: context.repo.owner,
- repo: context.repo.repo,
- run_id: context.payload.workflow_run.id
- })
- await create_check_run('success', 'Restarted workflow run due to preempted job')
>From 8e198ae3efc9397cbaa3d735b050adb2a097045f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 3 Dec 2024 16:51:59 -0500
Subject: [PATCH 2/2] Touch libc++ file to trigger CI
---
libcxx/foo | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 libcxx/foo
diff --git a/libcxx/foo b/libcxx/foo
new file mode 100644
index 00000000000000..e69de29bb2d1d6
More information about the llvm-commits
mailing list