[clang] [Clang] Make OpenMP offloading consistently use the bound architecture (PR #125135)
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 30 15:21:01 PST 2025
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/125135
Summary:
OpenMP was weirdly split between using the bound architecture from
`--offload-arch=` and the old `-march=` option which only worked for
single jobs. This patch removes that special handling. The main benefit
here is that we can now use `getToolchainArgs` without it throwing an
error.
I'm assuming SYCL doesn't care about this because they don't use an
architecture.
>From 010ac5fccbe55b4f3ec139e99cf33de4a752eb00 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 30 Jan 2025 17:18:15 -0600
Subject: [PATCH] [Clang] Make OpenMP offloading consistently use the bound
architecture
Summary:
OpenMP was weirdly split between using the bound architecture from
`--offload-arch=` and the old `-march=` option which only worked for
single jobs. This patch removes that special handling. The main benefit
here is that we can now use `getToolchainArgs` without it throwing an
error.
I'm assuming SYCL doesn't care about this because they don't use an
architecture.
---
clang/lib/Driver/Driver.cpp | 45 ++++++++++---------
clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 28 +-----------
clang/lib/Driver/ToolChains/Cuda.cpp | 28 ------------
.../Driver/amdgpu-openmp-system-arch-fail.c | 4 +-
clang/test/Driver/amdgpu-openmp-toolchain.c | 12 ++---
clang/test/Driver/openmp-offload-gpu.c | 16 +++----
clang/test/Driver/openmp-offload-jit.c | 8 ++--
clang/test/Driver/openmp-system-arch.c | 8 ++--
8 files changed, 49 insertions(+), 100 deletions(-)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 87855fdb799710..fb6e46070afdc2 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4708,23 +4708,7 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
return KnownArchs.lookup(TC);
llvm::DenseSet<StringRef> Archs;
- for (auto *Arg : Args) {
- // Extract any '--[no-]offload-arch' arguments intended for this toolchain.
- std::unique_ptr<llvm::opt::Arg> ExtractedArg = nullptr;
- if (Arg->getOption().matches(options::OPT_Xopenmp_target_EQ) &&
- ToolChain::getOpenMPTriple(Arg->getValue(0)) == TC->getTriple()) {
- Arg->claim();
- unsigned Index = Args.getBaseArgs().MakeIndex(Arg->getValue(1));
- unsigned Prev = Index;
- ExtractedArg = getOpts().ParseOneArg(Args, Index);
- if (!ExtractedArg || Index > Prev + 1) {
- TC->getDriver().Diag(diag::err_drv_invalid_Xopenmp_target_with_args)
- << Arg->getAsString(Args);
- continue;
- }
- Arg = ExtractedArg.get();
- }
-
+ for (auto *Arg : C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind)) {
// Add or remove the seen architectures in order of appearance. If an
// invalid architecture is given we simply exit.
if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
@@ -4781,14 +4765,31 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
return Archs;
if (Archs.empty()) {
- if (Kind == Action::OFK_Cuda)
+ if (Kind == Action::OFK_Cuda) {
Archs.insert(OffloadArchToString(OffloadArch::CudaDefault));
- else if (Kind == Action::OFK_HIP)
+ } else if (Kind == Action::OFK_HIP) {
Archs.insert(OffloadArchToString(OffloadArch::HIPDefault));
- else if (Kind == Action::OFK_OpenMP)
- Archs.insert(StringRef());
- else if (Kind == Action::OFK_SYCL)
+ } else if (Kind == Action::OFK_SYCL) {
Archs.insert(StringRef());
+ } else if (Kind == Action::OFK_OpenMP) {
+ // Accept legacy `-march` device arguments for OpenMP.
+ if (auto *Arg = C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind)
+ .getLastArg(options::OPT_march_EQ)) {
+ Archs.insert(Arg->getValue());
+ } else {
+ auto ArchsOrErr = TC->getSystemGPUArchs(Args);
+ if (!ArchsOrErr) {
+ TC->getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
+ << llvm::Triple::getArchTypeName(TC->getArch())
+ << llvm::toString(ArchsOrErr.takeError()) << "--offload-arch";
+ } else if (!ArchsOrErr->empty()) {
+ for (auto Arch : *ArchsOrErr)
+ Archs.insert(Args.MakeArgStringRef(Arch));
+ } else {
+ Archs.insert(StringRef());
+ }
+ }
+ }
} else {
Args.ClaimAllArgs(options::OPT_offload_arch_EQ);
Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ);
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index 3f0b3f2d86b3ed..f59e8584abbcb9 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -71,33 +71,9 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs(
const OptTable &Opts = getDriver().getOpts();
- if (DeviceOffloadKind == Action::OFK_OpenMP) {
- for (Arg *A : Args)
- if (!llvm::is_contained(*DAL, A))
- DAL->append(A);
-
- if (!DAL->hasArg(options::OPT_march_EQ)) {
- StringRef Arch = BoundArch;
- if (Arch.empty()) {
- auto ArchsOrErr = getSystemGPUArchs(Args);
- if (!ArchsOrErr) {
- std::string ErrMsg =
- llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError()));
- getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
- << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march";
- Arch = OffloadArchToString(OffloadArch::HIPDefault);
- } else {
- Arch = Args.MakeArgString(ArchsOrErr->front());
- }
- }
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);
- }
-
- return DAL;
- }
-
for (Arg *A : Args) {
- DAL->append(A);
+ if (!llvm::is_contained(*DAL, A))
+ DAL->append(A);
}
if (!BoundArch.empty()) {
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 0922a97ed7c19d..c800e9cfa0a8d8 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -969,34 +969,6 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
const OptTable &Opts = getDriver().getOpts();
- // For OpenMP device offloading, append derived arguments. Make sure
- // flags are not duplicated.
- // Also append the compute capability.
- if (DeviceOffloadKind == Action::OFK_OpenMP) {
- for (Arg *A : Args)
- if (!llvm::is_contained(*DAL, A))
- DAL->append(A);
-
- if (!DAL->hasArg(options::OPT_march_EQ)) {
- StringRef Arch = BoundArch;
- if (Arch.empty()) {
- auto ArchsOrErr = getSystemGPUArchs(Args);
- if (!ArchsOrErr) {
- std::string ErrMsg =
- llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError()));
- getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
- << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march";
- Arch = OffloadArchToString(OffloadArch::CudaDefault);
- } else {
- Arch = Args.MakeArgString(ArchsOrErr->front());
- }
- }
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);
- }
-
- return DAL;
- }
-
for (Arg *A : Args) {
// Make sure flags are not duplicated.
if (!llvm::is_contained(*DAL, A)) {
diff --git a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
index b7e1d0b2c56659..eb037183b4c3cb 100644
--- a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
+++ b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
@@ -12,9 +12,9 @@
// case when amdgpu_arch returns nothing or fails
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: cannot determine amdgcn architecture{{.*}}; consider passing it via '-march'
+// NO-OUTPUT-ERROR: error: cannot determine amdgcn architecture{{.*}}; consider passing it via '--offload-arch'
// case when amdgpu_arch does not return anything with successful execution
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT
-// EMPTY-OUTPUT: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march'
+// EMPTY-OUTPUT: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '--offload-arch'
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index 1c2ee26173139c..1f4d724a269e68 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -16,12 +16,12 @@
// CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp)
// CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp)
// CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp)
-// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp)
-// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp)
-// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp)
-// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa)" {5}, ir
-// CHECK-PHASES: 7: backend, {6}, ir, (device-openmp)
-// CHECK-PHASES: 8: offload, "device-openmp (amdgcn-amd-amdhsa)" {7}, ir
+// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp, gfx906)
+// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp, gfx906)
+// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp, gfx906)
+// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa:gfx906)" {5}, ir
+// CHECK-PHASES: 7: backend, {6}, ir, (device-openmp, gfx906)
+// CHECK-PHASES: 8: offload, "device-openmp (amdgcn-amd-amdhsa:gfx906)" {7}, ir
// CHECK-PHASES: 9: clang-offload-packager, {8}, image, (device-openmp)
// CHECK-PHASES: 10: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {9}, ir
// CHECK-PHASES: 11: backend, {10}, assembler, (host-openmp)
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index 74bd2a6aeee468..1f7e2996068c4a 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -235,13 +235,13 @@
// CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp)
// CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp)
// CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp)
-// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp)
-// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp)
-// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp)
-// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (nvptx64-nvidia-cuda)" {5}, ir
-// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp)
-// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp)
-// CHECK-PHASES: 9: offload, "device-openmp (nvptx64-nvidia-cuda)" {8}, object
+// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp, sm_52)
+// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp, sm_52)
+// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp, sm_52)
+// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (nvptx64-nvidia-cuda:sm_52)" {5}, ir
+// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp, sm_52)
+// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp, sm_52)
+// CHECK-PHASES: 9: offload, "device-openmp (nvptx64-nvidia-cuda:sm_52)" {8}, object
// CHECK-PHASES: 10: clang-offload-packager, {9}, image
// CHECK-PHASES: 11: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {10}, ir
// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp)
@@ -315,7 +315,7 @@
// RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 --offload-device-only -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEVICE-ONLY
// CHECK-DEVICE-ONLY: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]"
// CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_ASM:.*]]"
-// CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "{{.*}}-openmp-nvptx64-nvidia-cuda.o"
+// CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "{{.*}}-openmp-nvptx64-nvidia-cuda-sm_52.o"
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 --offload-device-only -E -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEVICE-ONLY-PP
diff --git a/clang/test/Driver/openmp-offload-jit.c b/clang/test/Driver/openmp-offload-jit.c
index 57f265ac37eacd..67b941a387a6a0 100644
--- a/clang/test/Driver/openmp-offload-jit.c
+++ b/clang/test/Driver/openmp-offload-jit.c
@@ -19,11 +19,11 @@
// PHASES-JIT: 0: input, "[[INPUT:.+]]", c, (host-openmp)
// PHASES-JIT-NEXT: 1: preprocessor, {0}, cpp-output, (host-openmp)
// PHASES-JIT-NEXT: 2: compiler, {1}, ir, (host-openmp)
-// PHASES-JIT-NEXT: 3: input, "[[INPUT]]", c, (device-openmp)
-// PHASES-JIT-NEXT: 4: preprocessor, {3}, cpp-output, (device-openmp)
-// PHASES-JIT-NEXT: 5: compiler, {4}, ir, (device-openmp)
+// PHASES-JIT-NEXT: 3: input, "[[INPUT]]", c, (device-openmp, {{.*}})
+// PHASES-JIT-NEXT: 4: preprocessor, {3}, cpp-output, (device-openmp, {{.*}})
+// PHASES-JIT-NEXT: 5: compiler, {4}, ir, (device-openmp, {{.*}})
// PHASES-JIT-NEXT: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp ([[TARGET:.+]])" {5}, ir
-// PHASES-JIT-NEXT: 7: backend, {6}, lto-bc, (device-openmp)
+// PHASES-JIT-NEXT: 7: backend, {6}, lto-bc, (device-openmp, {{.*}})
// PHASES-JIT-NEXT: 8: offload, "device-openmp ([[TARGET]])" {7}, lto-bc
// PHASES-JIT-NEXT: 9: clang-offload-packager, {8}, image, (device-openmp)
// PHASES-JIT-NEXT: 10: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {9}, ir
diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c
index d097c6bc065484..51021e352f4d30 100644
--- a/clang/test/Driver/openmp-system-arch.c
+++ b/clang/test/Driver/openmp-system-arch.c
@@ -68,13 +68,13 @@
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp \
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda --nvptx-arch-tool=%t/nvptx_arch_empty %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=NVPTX
-// NVPTX: error: cannot determine nvptx64 architecture: No NVIDIA GPU detected in the system; consider passing it via '-march'
+// NVPTX: error: cannot determine nvptx64 architecture: No NVIDIA GPU detected in the system; consider passing it via '--offload-arch'
// case when 'amdgpu-arch' returns nothing using `-fopenmp-targets=`.
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp \
// RUN: -fopenmp-targets=amdgcn-amd-amdhsa --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=AMDGPU
-// AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march'
+// AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '--offload-arch'
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for nvptx-arch.
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
@@ -82,7 +82,7 @@
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib \
// RUN: --nvptx-arch-tool=%t/nvptx_arch_sm_70 %s 2>&1 | \
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-NVPTX
-// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
+// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for amdgpu-arch.
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT= \
@@ -90,4 +90,4 @@
// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib \
// RUN: --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 %s 2>&1 | \
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-AMDGPU
-// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
+// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
More information about the cfe-commits
mailing list