[clang] [Driver] Enable -ftime-trace for CUDA/HIP device compilation (PR #179681)
Yaxun Liu via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 4 07:01:52 PST 2026
https://github.com/yxsamliu created https://github.com/llvm/llvm-project/pull/179681
Previously, -ftime-trace only generated trace files for host compilation
when compiling CUDA/HIP code. Device compilation was excluded because
the OffloadingPrefix was non-empty, causing handleTimeTrace() to be
skipped.
This patch enables -ftime-trace for offload device compilation by:
1. Passing the offloading prefix to handleTimeTrace()
2. Including the bound architecture in the trace filename
3. Deriving the trace output directory from the -o option for device
compilation (since the device output is a temp file)
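Trace files are now generated for each offload target. By default the host
trace is named after the output file, while each device trace is named after
the input file stem followed by the offloading prefix and the bound
architecture, and is placed in the directory of the -o output. For example,
when compiling output.hip with -o output.o for gfx906:
- Host: output.json
- Device: output-hip-amdgcn-amd-amdhsa-gfx906.json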
Note: When using --save-temps, multiple compilation phases (preprocess,
compile, codegen) write to the same trace file, with each phase
overwriting the previous. This is pre-existing behavior that also
affects regular C++ compilation and is not addressed by this patch.
This addresses a long-standing limitation noted in D150282.
Made with [Cursor](https://cursor.com)
From 8f1aa03e972655439933ffcc2a3fbe982ebf359f Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 4 Feb 2026 09:50:54 -0500
Subject: [PATCH] [Driver] Enable -ftime-trace for CUDA/HIP device compilation
Previously, -ftime-trace only generated trace files for host compilation
when compiling CUDA/HIP code. Device compilation was excluded because
the OffloadingPrefix was non-empty, causing handleTimeTrace() to be
skipped.
This patch enables -ftime-trace for offload device compilation by:
1. Passing the offloading prefix to handleTimeTrace()
2. Including the bound architecture in the trace filename
3. Deriving the trace output directory from the -o option for device
compilation (since the device output is a temp file)
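Trace files are now generated for each offload target. By default the host
trace is named after the output file, while each device trace is named after
the input file stem followed by the offloading prefix and the bound
architecture, and is placed in the directory of the -o output. For example,
when compiling output.hip with -o output.o for gfx906:
- Host: output.json
- Device: output-hip-amdgcn-amd-amdhsa-gfx906.json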
Note: When using --save-temps, multiple compilation phases (preprocess,
compile, codegen) write to the same trace file, with each phase
overwriting the previous. This is pre-existing behavior that also
affects regular C++ compilation and is not addressed by this patch.
This addresses a long-standing limitation noted in D150282.
---
clang/lib/Driver/Driver.cpp | 52 +++++++++++++++++++++++++------
clang/test/Driver/ftime-trace.cpp | 35 +++++++++++++++++++++
2 files changed, 77 insertions(+), 10 deletions(-)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eb3f9cbea2845..4df11efab5967 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5821,7 +5821,8 @@ InputInfoList Driver::BuildJobsForAction(
static void handleTimeTrace(Compilation &C, const ArgList &Args,
const JobAction *JA, const char *BaseInput,
- const InputInfo &Result) {
+ const InputInfo &Result,
+ StringRef OffloadingPrefix = "") {
Arg *A =
Args.getLastArg(options::OPT_ftime_trace, options::OPT_ftime_trace_EQ);
if (!A)
@@ -5830,18 +5831,43 @@ static void handleTimeTrace(Compilation &C, const ArgList &Args,
if (A->getOption().matches(options::OPT_ftime_trace_EQ)) {
Path = A->getValue();
if (llvm::sys::fs::is_directory(Path)) {
- SmallString<128> Tmp(Result.getFilename());
- llvm::sys::path::replace_extension(Tmp, "json");
- llvm::sys::path::append(Path, llvm::sys::path::filename(Tmp));
+ // When -ftime-trace=<dir> and it's a directory:
+ // - For host/non-offload: use the output filename stem
+ // - For offload: use input filename stem + offloading prefix
+ SmallString<128> Tmp;
+ if (OffloadingPrefix.empty()) {
+ Tmp = llvm::sys::path::stem(Result.getFilename());
+ } else {
+ Tmp = llvm::sys::path::stem(BaseInput);
+ Tmp += OffloadingPrefix;
+ }
+ Tmp += ".json";
+ llvm::sys::path::append(Path, Tmp);
}
} else {
if (Arg *DumpDir = Args.getLastArgNoClaim(options::OPT_dumpdir)) {
- // The trace file is ${dumpdir}${basename}.json. Note that dumpdir may not
- // end with a path separator.
+ // The trace file is ${dumpdir}${basename}${offloadprefix}.json. Note
+ // that dumpdir may not end with a path separator.
Path = DumpDir->getValue();
- Path += llvm::sys::path::filename(BaseInput);
+ Path += llvm::sys::path::stem(BaseInput);
+ Path += OffloadingPrefix;
+ } else if (!OffloadingPrefix.empty()) {
+ // For offloading, derive path from -o option or use current directory.
+ // The Result filename may be a temp file, so we use the -o output
+ // directory combined with the input filename and offload prefix.
+ if (Arg *FinalOutput = Args.getLastArg(options::OPT_o)) {
+ Path = llvm::sys::path::parent_path(FinalOutput->getValue());
+ if (!Path.empty())
+ Path += llvm::sys::path::get_separator();
+ }
+ Path += llvm::sys::path::stem(BaseInput);
+ Path += OffloadingPrefix;
} else {
- Path = Result.getFilename();
+ // Use the output filename stem for the trace file.
+ Path = llvm::sys::path::parent_path(Result.getFilename());
+ if (!Path.empty())
+ Path += llvm::sys::path::get_separator();
+ Path += llvm::sys::path::stem(Result.getFilename());
}
llvm::sys::path::replace_extension(Path, "json");
}
@@ -6100,8 +6126,14 @@ InputInfoList Driver::BuildJobsForActionNoCache(
AtTopLevel, MultipleArchs,
OffloadingPrefix),
BaseInput);
- if (T->canEmitIR() && OffloadingPrefix.empty())
- handleTimeTrace(C, Args, JA, BaseInput, Result);
+ if (T->canEmitIR()) {
+ // For time trace, include the bound arch in the prefix to ensure unique
+ // trace files for each offload target.
+ std::string TimeTracePrefix = OffloadingPrefix;
+ if (!OffloadingPrefix.empty() && !BoundArch.empty())
+ TimeTracePrefix += "-" + BoundArch.str();
+ handleTimeTrace(C, Args, JA, BaseInput, Result, TimeTracePrefix);
+ }
}
if (CCCPrintBindings && !CCGenDiagnostics) {
diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp
index 60c5885704b58..530d52482497a 100644
--- a/clang/test/Driver/ftime-trace.cpp
+++ b/clang/test/Driver/ftime-trace.cpp
@@ -63,6 +63,41 @@
// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose'
// UNUSED-NOT: warning:
+/// Test HIP offloading: -ftime-trace should generate traces for both host and device.
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -x hip d/a.cpp --offload-arch=gfx906 --offload-arch=gfx90a \
+// RUN: -c -o e/a.o --target=x86_64-linux-gnu 2>&1 \
+// RUN: | FileCheck %s --check-prefix=HIP
+// HIP-DAG: -cc1{{.*}} "-triple" "amdgcn-amd-amdhsa"{{.*}} "-ftime-trace=e/a-hip-amdgcn-amd-amdhsa-gfx906.json"
+// HIP-DAG: -cc1{{.*}} "-triple" "amdgcn-amd-amdhsa"{{.*}} "-ftime-trace=e/a-hip-amdgcn-amd-amdhsa-gfx90a.json"
+// HIP-DAG: -cc1{{.*}} "-triple" "x86_64{{.*}}"{{.*}} "-ftime-trace=e/a.json"
+
+/// Test HIP offloading with new driver: same output as above.
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -x hip d/a.cpp --offload-arch=gfx906 --offload-arch=gfx90a \
+// RUN: -c -o e/a.o --target=x86_64-linux-gnu --offload-new-driver 2>&1 \
+// RUN: | FileCheck %s --check-prefix=HIP
+
+/// Test HIP offloading with -ftime-trace=<dir>: traces go to specified directory.
+// RUN: %clang -### -ftime-trace=f -ftime-trace-granularity=0 -x hip d/a.cpp --offload-arch=gfx906 \
+// RUN: -c -o e/a.o --target=x86_64-linux-gnu 2>&1 \
+// RUN: | FileCheck %s --check-prefix=HIP-DIR
+// HIP-DIR-DAG: -cc1{{.*}} "-triple" "amdgcn-amd-amdhsa"{{.*}} "-ftime-trace=f{{/|\\\\}}a-hip-amdgcn-amd-amdhsa-gfx906.json"
+// HIP-DIR-DAG: -cc1{{.*}} "-triple" "x86_64{{.*}}"{{.*}} "-ftime-trace=f{{/|\\\\}}a.json"
+
+/// Test HIP offloading with --save-temps: both host and device get unique trace files.
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -x hip d/a.cpp --offload-arch=gfx906 \
+// RUN: -c -o e/a.o --target=x86_64-linux-gnu --save-temps 2>&1 \
+// RUN: | FileCheck %s --check-prefix=HIP-SAVE-TEMPS
+// HIP-SAVE-TEMPS-DAG: -cc1{{.*}} "-triple" "amdgcn-amd-amdhsa"{{.*}} "-ftime-trace=e/a-hip-amdgcn-amd-amdhsa-gfx906.json"
+// HIP-SAVE-TEMPS-DAG: -cc1{{.*}} "-triple" "x86_64{{.*}}"{{.*}} "-ftime-trace=e/a-host-x86_64-unknown-linux-gnu.json"
+
+/// Test CUDA offloading: -ftime-trace should generate traces for both host and device.
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -x cuda d/a.cpp --offload-arch=sm_70 --offload-arch=sm_80 \
+// RUN: -c -o e/a.o --target=x86_64-linux-gnu --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CUDA
+// CUDA-DAG: -cc1{{.*}} "-triple" "nvptx64-nvidia-cuda"{{.*}} "-ftime-trace=e/a-cuda-nvptx64-nvidia-cuda-sm_70.json"
+// CUDA-DAG: -cc1{{.*}} "-triple" "nvptx64-nvidia-cuda"{{.*}} "-ftime-trace=e/a-cuda-nvptx64-nvidia-cuda-sm_80.json"
+// CUDA-DAG: -cc1{{.*}} "-triple" "x86_64{{.*}}"{{.*}} "-ftime-trace=e/a.json"
+
template <typename T>
struct Struct {
T Num;
More information about the cfe-commits
mailing list