[libcxx] [llvm] [libc++] Also restart failed jobs when they fail for a spurious reason (PR #118550)

Louis Dionne via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 3 14:15:31 PST 2024


https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/118550

>From d0f5c241d58d5bb1e80ab2450f5415b7ffc79a7f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 3 Dec 2024 16:48:30 -0500
Subject: [PATCH 1/3] [libc++] Also restart failed jobs when they fail for a
 spurious reason

Since we moved to a Docker-in-Docker setup, CI jobs sometimes fail due
to the Docker VM dying with 'context cancelled' errors. This is currently
not recognized as a spurious failure, which leads to the job not being
automatically restarted. This patch fixes that.

This requires reorganizing the restarter workflow a bit to consider
failed runs differently based on whether they have failed for a
legitimate or spurious reason.
---
 .../libcxx-restart-preempted-jobs.yaml        | 131 +++++++++---------
 1 file changed, 63 insertions(+), 68 deletions(-)

diff --git a/.github/workflows/libcxx-restart-preempted-jobs.yaml b/.github/workflows/libcxx-restart-preempted-jobs.yaml
index 21879ce19c27c5..b6b3a11dee973d 100644
--- a/.github/workflows/libcxx-restart-preempted-jobs.yaml
+++ b/.github/workflows/libcxx-restart-preempted-jobs.yaml
@@ -32,34 +32,29 @@ jobs:
         uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
         with:
           script: |
-            const failure_regex = /Process completed with exit code 1./
-            const preemption_regex = /The runner has received a shutdown signal/
-
-            const wf_run = context.payload.workflow_run
-            core.notice(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
+            const FAILURE_REGEX = /Process completed with exit code 1./
+            const SPURIOUS_FAILURE_REGEX = /^context cancelled$/
+            const PREEMPTION_REGEX = /The runner has received a shutdown signal/
 
+            function log(msg) {
+              core.notice(msg)
+            }
 
-            async function create_check_run(conclusion, message) {
-                // Create a check run on the given workflow run to indicate if
-                // we are restarting the workflow or not.
-                if (conclusion != 'success' && conclusion != 'skipped' && conclusion != 'neutral') {
-                  core.setFailed('Invalid conclusion: ' + conclusion)
-                }
-                await github.rest.checks.create({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    name: 'Restart Preempted Job',
-                    head_sha: wf_run.head_sha,
-                    status: 'completed',
-                    conclusion: conclusion,
-                    output: {
-                      title: 'Restarted Preempted Job',
-                      summary: message
-                    }
-                })
+            // Return whether a failed check run actually failed due to a spurious error.
+            // This requires looking at the logs of the run.
+            function is_spurious_failure(check_run) {
+              return false;
+              // github.rest.actions.downloadJobLogsForWorkflowRun({
+              //   owner,
+              //   repo,
+              //   job_id,
+              // });
             }
 
-            console.log('Listing check runs for suite')
+            const wf_run = context.payload.workflow_run
+            log(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
+
+            log('Listing check runs for suite')
             const check_suites = await github.rest.checks.listForSuite({
               owner: context.repo.owner,
               repo: context.repo.repo,
@@ -67,66 +62,66 @@ jobs:
               per_page: 100 // FIXME: We don't have 100 check runs yet, but we should handle this better.
             })
 
-            check_run_ids = [];
+            preemptions = [];
+            spurious_failures = [];
+            legitimate_failures = [];
             for (check_run of check_suites.data.check_runs) {
-              console.log('Checking check run: ' + check_run.id);
+              log(`Checking check run: ${check_run.id}`);
               if (check_run.status != 'completed') {
-                console.log('Check run was not completed. Skipping.');
+                log('Check run was not completed. Skipping.');
                 continue;
               }
+
               if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
-                console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
+                log(`Check run had conclusion: ${check_run.conclusion}. Skipping.`);
                 continue;
               }
-              check_run_ids.push(check_run.id);
-            }
-
-            has_preempted_job = false;
-
-            for (check_run_id of check_run_ids) {
-              console.log('Listing annotations for check run: ' + check_run_id);
 
               annotations = await github.rest.checks.listAnnotations({
                 owner: context.repo.owner,
                 repo: context.repo.repo,
-                check_run_id: check_run_id
+                check_run_id: check_run.id
               })
 
-              for (annotation of annotations.data) {
-                if (annotation.annotation_level != 'failure') {
-                  continue;
-                }
-
-                const preemption_match = annotation.message.match(preemption_regex);
-
-                if (preemption_match != null) {
-                  console.log('Found preemption message: ' + annotation.message);
-                  has_preempted_job = true;
-                }
+              preemption_annotation = annotations.data.find(function(annotation) {
+                return annotation.annotation_level == 'failure' &&
+                       annotation.message.match(PREEMPTION_REGEX) != null;
+              });
+              if (preemption_annotation != null) {
+                log(`Found preemption message: ${preemption_annotation.message}`);
+                preemptions.push(check_run);
+                break;
+              }
 
-                const failure_match = annotation.message.match(failure_regex);
-                if (failure_match != null) {
-                  // We only want to restart the workflow if all of the failures were due to preemption.
-                  // We don't want to restart the workflow if there were other failures.
-                  core.notice('Choosing not to rerun workflow because we found a non-preemption failure' +
-                    'Failure message: "' + annotation.message + '"');
-                  await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n'
-                    + 'Failure message: ' + annotation.message)
-                  return;
+              failure_annotation = annotations.data.find(function(annotation) {
+                return annotation.annotation_level == 'failure' &&
+                       annotation.message.match(FAILURE_REGEX) != null;
+              });
+              if (failure_annotation != null) {
+                // We found a failure annotation, now classify it as spurious or legitimate.
+                if (is_spurious_failure(check_run)) {
+                  log(`Found spurious failure message in run: ${check_run.id}`);
+                  spurious_failures.push(check_run);
+                } else {
+                  log(`Found legitimate failure annotation: ${failure_annotation.message}`);
+                  legitimate_failures.push(check_run);
                 }
+                break;
               }
             }
 
-            if (!has_preempted_job) {
-              core.notice('No preempted jobs found. Not restarting workflow.');
-              await create_check_run('neutral', 'No preempted jobs found. Not restarting workflow.')
-              return;
+            if (spurious_failures || preemptions) {
+              log('Found some spurious failures and/or preempted jobs');
+              if (legitimate_failures) {
+                log('Also found some legitimate failures, so not restarting the workflow.');
+              } else {
+                log('Did not find any legitimate failures. Restarting workflow.');
+                await github.rest.actions.reRunWorkflowFailedJobs({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  run_id: context.payload.workflow_run.id
+                })
+              }
+            } else {
+              log('Did not find any preempted jobs or spurious failures. Not restarting the workflow.');
             }
-
-            core.notice("Restarted workflow: " + context.payload.workflow_run.id);
-            await github.rest.actions.reRunWorkflowFailedJobs({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                run_id: context.payload.workflow_run.id
-              })
-            await create_check_run('success', 'Restarted workflow run due to preempted job')

>From 8e198ae3efc9397cbaa3d735b050adb2a097045f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 3 Dec 2024 16:51:59 -0500
Subject: [PATCH 2/3] Touch libc++ file to trigger CI

---
 libcxx/foo | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 libcxx/foo

diff --git a/libcxx/foo b/libcxx/foo
new file mode 100644
index 00000000000000..e69de29bb2d1d6

>From 8e3d68655b9693651fe88930db8824ebabe6ca76 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 3 Dec 2024 17:15:20 -0500
Subject: [PATCH 3/3] I think we can actually detect Docker issues through
 annotations alone

---
 .../libcxx-restart-preempted-jobs.yaml        | 31 ++++---------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/libcxx-restart-preempted-jobs.yaml b/.github/workflows/libcxx-restart-preempted-jobs.yaml
index b6b3a11dee973d..fe45a4370748cb 100644
--- a/.github/workflows/libcxx-restart-preempted-jobs.yaml
+++ b/.github/workflows/libcxx-restart-preempted-jobs.yaml
@@ -33,24 +33,12 @@ jobs:
         with:
           script: |
             const FAILURE_REGEX = /Process completed with exit code 1./
-            const SPURIOUS_FAILURE_REGEX = /^context cancelled$/
-            const PREEMPTION_REGEX = /The runner has received a shutdown signal/
+            const PREEMPTION_REGEX = /The runner has received a shutdown signal|The operation was canceled/
 
             function log(msg) {
               core.notice(msg)
             }
 
-            // Return whether a failed check run actually failed due to a spurious error.
-            // This requires looking at the logs of the run.
-            function is_spurious_failure(check_run) {
-              return false;
-              // github.rest.actions.downloadJobLogsForWorkflowRun({
-              //   owner,
-              //   repo,
-              //   job_id,
-              // });
-            }
-
             const wf_run = context.payload.workflow_run
             log(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
 
@@ -63,7 +51,6 @@ jobs:
             })
 
             preemptions = [];
-            spurious_failures = [];
             legitimate_failures = [];
             for (check_run of check_suites.data.check_runs) {
               log(`Checking check run: ${check_run.id}`);
@@ -98,20 +85,14 @@ jobs:
                        annotation.message.match(FAILURE_REGEX) != null;
               });
               if (failure_annotation != null) {
-                // We found a failure annotation, now classify it as spurious or legitimate.
-                if (is_spurious_failure(check_run)) {
-                  log(`Found spurious failure message in run: ${check_run.id}`);
-                  spurious_failures.push(check_run);
-                } else {
-                  log(`Found legitimate failure annotation: ${failure_annotation.message}`);
-                  legitimate_failures.push(check_run);
-                }
+                log(`Found legitimate failure annotation: ${failure_annotation.message}`);
+                legitimate_failures.push(check_run);
                 break;
               }
             }
 
-            if (spurious_failures || preemptions) {
-              log('Found some spurious failures and/or preempted jobs');
+            if (preemptions) {
+              log('Found some preempted jobs');
               if (legitimate_failures) {
                 log('Also found some legitimate failures, so not restarting the workflow.');
               } else {
@@ -123,5 +104,5 @@ jobs:
                 })
               }
             } else {
-              log('Did not find any preempted jobs or spurious failures. Not restarting the workflow.');
+              log('Did not find any preempted jobs. Not restarting the workflow.');
             }



More information about the llvm-commits mailing list