[clang] [llvm] [Clang] Add env var for nvptx-arch/amdgpu-arch timeout (PR #102521)

Joel E. Denny via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 8 16:34:48 PDT 2024


https://github.com/jdenny-ornl updated https://github.com/llvm/llvm-project/pull/102521

>From 6546428805b52f1b6f350193ab08ff027892710f Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 8 Aug 2024 15:02:04 -0400
Subject: [PATCH 1/5] [Clang] Add env var for nvptx-arch/amdgpu-arch timeout

When working on very busy systems, check-offload frequently fails many
tests with this diagnostic:

```
clang: error: cannot determine amdgcn architecture: /tmp/llvm/build/bin/amdgpu-arch: Child timed out: ; consider passing it via '-march'
```

The timeout is hard-coded to 10 seconds.  This patch lets the environment
variable `CLANG_TOOL_CHAIN_PROGRAM_WAIT` override it with a positive
number of seconds.

The new environment variable should be documented somewhere.  Any suggestions on where?
---
 clang/lib/Driver/ToolChain.cpp         | 10 +++++++++-
 clang/lib/Driver/ToolChains/AMDGPU.cpp |  3 ++-
 clang/lib/Driver/ToolChains/Cuda.cpp   |  3 ++-
 llvm/utils/lit/lit/TestingConfig.py    |  1 +
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 2d50c2cbbc881c..04b281e1bb10cd 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
@@ -105,7 +106,7 @@ ToolChain::ToolChain(const Driver &D, const llvm::Triple &T,
 
 llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
 ToolChain::executeToolChainProgram(StringRef Executable,
-                                   unsigned SecondsToWait) const {
+                                   unsigned DefaultSecondsToWait) const {
   llvm::SmallString<64> OutputFile;
   llvm::sys::fs::createTemporaryFile("toolchain-program", "txt", OutputFile);
   llvm::FileRemover OutputRemover(OutputFile.c_str());
@@ -116,6 +117,13 @@ ToolChain::executeToolChainProgram(StringRef Executable,
   };
 
   std::string ErrorMessage;
+  int SecondsToWait = DefaultSecondsToWait;
+  if (std::optional<std::string> Str =
+          llvm::sys::Process::GetEnv("CLANG_TOOL_CHAIN_PROGRAM_WAIT")) {
+    int Val = std::atoi(Str->c_str());
+    if (Val > 0)
+      SecondsToWait = Val;
+  }
   if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait,
                                 /*MemoryLimit=*/0, &ErrorMessage))
     return llvm::createStringError(std::error_code(),
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index aa8f9197cfabc3..4ed366d21f5c43 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -899,7 +899,8 @@ AMDGPUToolChain::getSystemGPUArchs(const ArgList &Args) const {
   else
     Program = GetProgramPath("amdgpu-arch");
 
-  auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);
+  auto StdoutOrErr = executeToolChainProgram(Program,
+                                             /*DefaultSecondsToWait=*/10);
   if (!StdoutOrErr)
     return StdoutOrErr.takeError();
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 17c952c808f725..104217eaf5d849 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -804,7 +804,8 @@ NVPTXToolChain::getSystemGPUArchs(const ArgList &Args) const {
   else
     Program = GetProgramPath("nvptx-arch");
 
-  auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);
+  auto StdoutOrErr = executeToolChainProgram(Program,
+                                             /*DefaultSecondsToWait=*/10);
   if (!StdoutOrErr)
     return StdoutOrErr.takeError();
 
diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py
index eb9f8de2a7f960..06713429d06b4b 100644
--- a/llvm/utils/lit/lit/TestingConfig.py
+++ b/llvm/utils/lit/lit/TestingConfig.py
@@ -26,6 +26,7 @@ def fromdefaults(litConfig):
             "SYSTEMROOT",
             "TERM",
             "CLANG",
+            "CLANG_TOOL_CHAIN_PROGRAM_WAIT",
             "LLDB",
             "LD_PRELOAD",
             "LLVM_SYMBOLIZER_PATH",

>From 711cf93741ba618c7ee8051190b18bba938121fa Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 8 Aug 2024 16:21:28 -0400
Subject: [PATCH 2/5] Apply reviewer suggestion

---
 clang/include/clang/Basic/DiagnosticDriverKinds.td | 3 ++-
 clang/lib/Driver/ToolChain.cpp                     | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 3d8240f8357b40..05642f803d07de 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -99,7 +99,8 @@ def warn_drv_amdgpu_cov6: Warning<
   " use at your own risk">;
 def err_drv_undetermined_gpu_arch : Error<
   "cannot determine %0 architecture: %1; consider passing it via "
-  "'%2'">;
+  "'%2' or increasing the tool timeout using the environment variable "
+  "'CLANG_TOOL_CHAIN_PROGRAM_WAIT' (in secs, <=0 is inifinite)">;
 def warn_drv_multi_gpu_arch : Warning<
   "multiple %0 architectures are detected: %1; only the first one is used for "
   "'%2'">, InGroup<MultiGPU>;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 04b281e1bb10cd..22de2bdeaa29f5 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -123,6 +123,8 @@ ToolChain::executeToolChainProgram(StringRef Executable,
     int Val = std::atoi(Str->c_str());
     if (Val > 0)
       SecondsToWait = Val;
+    else if (Val <= 0)
+      SecondsToWait = 0; // infinite
   }
   if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait,
                                 /*MemoryLimit=*/0, &ErrorMessage))

>From 6ec00ff8e8ca2d4a3f7267f3189ee190e211ca05 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 8 Aug 2024 16:26:24 -0400
Subject: [PATCH 3/5] Update clang/lib/Driver/ToolChain.cpp

---
 clang/lib/Driver/ToolChain.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 22de2bdeaa29f5..c6436fb2ee33b4 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -123,7 +123,7 @@ ToolChain::executeToolChainProgram(StringRef Executable,
     int Val = std::atoi(Str->c_str());
     if (Val > 0)
       SecondsToWait = Val;
-    else if (Val <= 0)
+    else
       SecondsToWait = 0; // infinite
   }
   if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait,

>From 37cd64bc733f06d3caf3c660f57b5d717a59faf7 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 8 Aug 2024 18:59:15 -0400
Subject: [PATCH 4/5] Apply reviewer suggestions

---
 .../include/clang/Basic/DiagnosticDriverKinds.td |  7 ++++---
 clang/include/clang/Driver/ToolChain.h           |  2 +-
 clang/lib/Driver/ToolChain.cpp                   | 14 ++++++++------
 clang/test/Driver/amdgpu-hip-system-arch.c       |  8 ++++++++
 clang/test/Driver/nvptx-cuda-system-arch.c       |  8 ++++++++
 clang/test/Driver/openmp-system-arch.c           | 16 ++++++++++++++++
 llvm/utils/lit/lit/TestingConfig.py              |  2 +-
 7 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 05642f803d07de..92a602829933ce 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -98,9 +98,10 @@ def warn_drv_amdgpu_cov6: Warning<
   "code object v6 is still in development and not ready for production use yet;"
   " use at your own risk">;
 def err_drv_undetermined_gpu_arch : Error<
-  "cannot determine %0 architecture: %1; consider passing it via "
-  "'%2' or increasing the tool timeout using the environment variable "
-  "'CLANG_TOOL_CHAIN_PROGRAM_WAIT' (in secs, <=0 is inifinite)">;
+  "cannot determine %0 architecture: %1; consider passing it via '%2'; "
+  "environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool "
+  "timeout (integer secs, <=0 is infinite)">;
+
 def warn_drv_multi_gpu_arch : Warning<
   "multiple %0 architectures are detected: %1; only the first one is used for "
   "'%2'">, InGroup<MultiGPU>;
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index ece1384d5d3c02..fbac660548346e 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -206,7 +206,7 @@ class ToolChain {
   /// Executes the given \p Executable and returns the stdout.
   llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
   executeToolChainProgram(StringRef Executable,
-                          unsigned SecondsToWait = 0) const;
+                          unsigned DefaultSecondsToWait = 0) const;
 
   void setTripleEnvironment(llvm::Triple::EnvironmentType Env);
 
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index c6436fb2ee33b4..5fd8af373e1db5 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -119,12 +119,14 @@ ToolChain::executeToolChainProgram(StringRef Executable,
   std::string ErrorMessage;
   int SecondsToWait = DefaultSecondsToWait;
   if (std::optional<std::string> Str =
-          llvm::sys::Process::GetEnv("CLANG_TOOL_CHAIN_PROGRAM_WAIT")) {
-    int Val = std::atoi(Str->c_str());
-    if (Val > 0)
-      SecondsToWait = Val;
-    else
-      SecondsToWait = 0; // infinite
+          llvm::sys::Process::GetEnv("CLANG_TOOLCHAIN_PROGRAM_TIMEOUT")) {
+    int SecondsToWait;
+    if (!llvm::to_integer(*Str, SecondsToWait))
+      return llvm::createStringError(std::error_code(),
+                                     "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected "
+                                     "an integer, got '" +
+                                         *Str + "'");
+    SecondsToWait = std::min(SecondsToWait, 0);
   }
   if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait,
                                 /*MemoryLimit=*/0, &ErrorMessage))
diff --git a/clang/test/Driver/amdgpu-hip-system-arch.c b/clang/test/Driver/amdgpu-hip-system-arch.c
index a46077b38fac08..f25a4087080f6d 100644
--- a/clang/test/Driver/amdgpu-hip-system-arch.c
+++ b/clang/test/Driver/amdgpu-hip-system-arch.c
@@ -29,3 +29,11 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib --offload-new-driver --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 -x hip %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=ARCH-GFX906
 // ARCH-GFX906: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx906"
+
+// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed.
+// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
+// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
+// RUN:     --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 \
+// RUN:     -x hip %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=BAD-TIMEOUT
+// BAD-TIMEOUT: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
diff --git a/clang/test/Driver/nvptx-cuda-system-arch.c b/clang/test/Driver/nvptx-cuda-system-arch.c
index e6a2fa40f0a038..6a8a218406d139 100644
--- a/clang/test/Driver/nvptx-cuda-system-arch.c
+++ b/clang/test/Driver/nvptx-cuda-system-arch.c
@@ -42,3 +42,11 @@
 // RUN:     --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda %s 2>&1 | FileCheck %s --check-prefix=MARCH-sm_89
 // MARCH-sm_89: warning: multiple nvptx64 architectures are detected: sm_89, sm_80; only the first one is used for '-march' [-Wmulti-gpu]
 // MARCH-sm_89: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-target-cpu" "sm_89"
+
+// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed.
+// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
+// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib \
+// RUN:     --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_sm_70 \
+// RUN:     --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda -x cuda %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=BAD-TIMEOUT
+// BAD-TIMEOUT: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c
index ea6ec6428592b3..cd49f460099666 100644
--- a/clang/test/Driver/openmp-system-arch.c
+++ b/clang/test/Driver/openmp-system-arch.c
@@ -75,3 +75,19 @@
 // RUN:     -fopenmp-targets=amdgcn-amd-amdhsa --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=AMDGPU
 // AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march'
+
+// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for nvptx-arch.
+// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
+// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:     -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib \
+// RUN:     --nvptx-arch-tool=%t/nvptx_arch_sm_70 %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=BAD-TIMEOUT-NVPTX
+// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
+
+// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for amdgpu-arch.
+// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT= \
+// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:     -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib \
+// RUN:     --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=BAD-TIMEOUT-AMDGPU
+// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py
index 06713429d06b4b..9d8e93460933d8 100644
--- a/llvm/utils/lit/lit/TestingConfig.py
+++ b/llvm/utils/lit/lit/TestingConfig.py
@@ -26,7 +26,7 @@ def fromdefaults(litConfig):
             "SYSTEMROOT",
             "TERM",
             "CLANG",
-            "CLANG_TOOL_CHAIN_PROGRAM_WAIT",
+            "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT",
             "LLDB",
             "LD_PRELOAD",
             "LLVM_SYMBOLIZER_PATH",

>From 42ac67eae5c3952083de1fcb68b9344374ba56f6 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 8 Aug 2024 19:33:44 -0400
Subject: [PATCH 5/5] I wish I knew how to write timeout tests

---
 clang/lib/Driver/ToolChain.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 5fd8af373e1db5..dbac6c370e85e6 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -120,7 +120,6 @@ ToolChain::executeToolChainProgram(StringRef Executable,
   int SecondsToWait = DefaultSecondsToWait;
   if (std::optional<std::string> Str =
           llvm::sys::Process::GetEnv("CLANG_TOOLCHAIN_PROGRAM_TIMEOUT")) {
-    int SecondsToWait;
     if (!llvm::to_integer(*Str, SecondsToWait))
       return llvm::createStringError(std::error_code(),
                                      "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected "



More information about the cfe-commits mailing list