[clang] e178824 - [CUDA][HIP] Rename and fix `-fcuda-approx-transcendentals`

Tue Jul 25 09:01:58 PDT 2023

Author: Yaxun (Sam) Liu
Date: 2023-07-25T12:01:41-04:00
New Revision: e17882430e5c243c7244dadcd0a6b5cce2219386

URL: https://github.com/llvm/llvm-project/commit/e17882430e5c243c7244dadcd0a6b5cce2219386
DIFF: https://github.com/llvm/llvm-project/commit/e17882430e5c243c7244dadcd0a6b5cce2219386.diff

LOG: [CUDA][HIP] Rename and fix `-fcuda-approx-transcendentals`

Rename -fcuda-approx-transcendentals as
-fgpu-approx-transcendentals and pass it
to both device and host clang -cc1.

Fix its interaction with -ffast-math to allow
-fno-gpu-approx-transcendentals to override
the implicit -fcuda-approx-transcendentals
due to -ffast-math.

Rename the predefined macro to be
__CLANG_GPU_APPROX_TRANSCENDENTALS__.
Emit the macro for both device and host compilation.

Reviewed by: Artem Belevich, Fangrui Song

Differential Revision: https://reviews.llvm.org/D154797

Added: 
    

Modified: 
    clang/include/clang/Basic/LangOptions.def
    clang/include/clang/Driver/Options.td
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/lib/Driver/ToolChains/HIPAMD.cpp
    clang/lib/Driver/ToolChains/HIPSPV.cpp
    clang/lib/Frontend/InitPreprocessor.cpp
    clang/lib/Headers/__clang_cuda_math.h
    clang/lib/Headers/__clang_cuda_runtime_wrapper.h
    clang/lib/Headers/__clang_hip_math.h
    clang/test/Driver/hip-macros.hip
    clang/test/Driver/hip-options.hip
    clang/test/Headers/__clang_hip_math.hip
    clang/test/Headers/nvptx_device_math_sin.c
    clang/test/Headers/nvptx_device_math_sin.cpp
    clang/test/Preprocessor/cuda-approx-transcendentals.cu

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index f7ec0406f33e3f..5f1ebced561506 100644

--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -265,7 +265,7 @@ ENUM_LANGOPT(HLSLVersion, HLSLLangStd, 16, HLSL_Unset, "HLSL Version")
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
-LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
+LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
 LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 229f6141c750fe..dec49001c590e7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6801,11 +6801,12 @@ def sycl_std_EQ : Joined<["-"], "sycl-std=">, Group<sycl_Group>,
   MarshallingInfoEnum<LangOpts<"SYCLVersion">, "SYCL_None">,
   ShouldParseIf<!strconcat(fsycl_is_device.KeyPath, "||", fsycl_is_host.KeyPath)>;
 
-defm cuda_approx_transcendentals : BoolFOption<"cuda-approx-transcendentals",
-  LangOpts<"CUDADeviceApproxTranscendentals">, DefaultFalse,
+defm gpu_approx_transcendentals : BoolFOption<"gpu-approx-transcendentals",
+  LangOpts<"GPUDeviceApproxTranscendentals">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Use">, NegFlag<SetFalse, [], "Don't use">,
-  BothFlags<[], " approximate transcendental functions">>,
-  ShouldParseIf<fcuda_is_device.KeyPath>;
+  BothFlags<[], " approximate transcendental functions">>;
+def : Flag<["-"], "fcuda-approx-transcendentals">, Alias<fgpu_approx_transcendentals>;
+def : Flag<["-"], "fno-cuda-approx-transcendentals">, Alias<fno_gpu_approx_transcendentals>;
 
 //===----------------------------------------------------------------------===//
 // Frontend Options - cc1 + fc1

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index adb550d9c5da50..1863432a660035 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7245,6 +7245,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     auto CUID = cast<InputAction>(SourceAction)->getId();
     if (!CUID.empty())
       CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID)));
+
+    // -ffast-math turns on -fgpu-approx-transcendentals implicitly, but will
+    // be overriden by -fno-gpu-approx-transcendentals.
+    bool UseApproxTranscendentals = Args.hasFlag(
+        options::OPT_ffast_math, options::OPT_fno_fast_math, false);
+    if (Args.hasFlag(options::OPT_fgpu_approx_transcendentals,
+                     options::OPT_fno_gpu_approx_transcendentals,
+                     UseApproxTranscendentals))
+      CmdArgs.push_back("-fgpu-approx-transcendentals");
+  } else {
+    Args.claimAllArgs(options::OPT_fgpu_approx_transcendentals,
+                      options::OPT_fno_gpu_approx_transcendentals);
   }
 
   if (IsHIP) {

diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 3a577650eb082a..47187f554db29d 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -801,10 +801,6 @@ void CudaToolChain::addClangTargetOptions(
     CC1Args.append(
         {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
 
-    if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
-                           options::OPT_fno_cuda_approx_transcendentals, false))
-      CC1Args.push_back("-fcuda-approx-transcendentals");
-
     // Unsized function arguments used for variadics were introduced in CUDA-9.0
     // We still do not support generating code that actually uses variadic
     // arguments yet, but we do need to allow parsing them as recent CUDA

diff  --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index e509a01f2f977e..7ff880270cab3f 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -243,10 +243,6 @@ void HIPAMDToolChain::addClangTargetOptions(
 
   CC1Args.push_back("-fcuda-is-device");
 
-  if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
-                         options::OPT_fno_cuda_approx_transcendentals, false))
-    CC1Args.push_back("-fcuda-approx-transcendentals");
-
   if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                           false))
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});

diff  --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index ea6a16029130df..a144b28057f4c2 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -143,10 +143,6 @@ void HIPSPVToolChain::addClangTargetOptions(
        // TODO: Allow autovectorization when SPIR-V backend arrives.
        "-mllvm", "-vectorize-loops=false", "-mllvm", "-vectorize-slp=false"});
 
-  if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
-                         options::OPT_fno_cuda_approx_transcendentals, false))
-    CC1Args.push_back("-fcuda-approx-transcendentals");
-
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
   if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ,

diff  --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index f8fae82fba1208..93cd97bf8b681d 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1297,11 +1297,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__CUDA_ARCH__");
   }
 
-  // We need to communicate this to our CUDA header wrapper, which in turn
-  // informs the proper CUDA headers of this choice.
-  if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
-    Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
-  }
+  // We need to communicate this to our CUDA/HIP header wrapper, which in turn
+  // informs the proper CUDA/HIP headers of this choice.
+  if (LangOpts.GPUDeviceApproxTranscendentals)
+    Builder.defineMacro("__CLANG_GPU_APPROX_TRANSCENDENTALS__");
 
   // Define a macro indicating that the source file is being compiled with a
   // SYCL device compiler which doesn't produce host binary.

diff  --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h
index e447590393ec71..c06da2713fd8ce 100644
--- a/clang/lib/Headers/__clang_cuda_math.h
+++ b/clang/lib/Headers/__clang_cuda_math.h
@@ -45,9 +45,9 @@
 // libdevice provides fast low precision and slow full-recision implementations
 // for some functions. Which one gets selected depends on
 // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
-// -ffast-math or -fcuda-approx-transcendentals are in effect.
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
 #pragma push_macro("__FAST_OR_SLOW")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __FAST_OR_SLOW(fast, slow) fast
 #else
 #define __FAST_OR_SLOW(fast, slow) slow

diff  --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index 512fc300fc344c..d369c86fe10645 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -196,12 +196,12 @@ inline __host__ double __signbitd(double x) {
 
 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
-// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+// sin and exp.  This is controlled in clang by -fgpu-approx-transcendentals.
 //
 // device_functions.hpp uses __USE_FAST_MATH__ for a 
diff erent purpose (fast vs.
 // slow divides), so we need to scope our define carefully here.
 #pragma push_macro("__USE_FAST_MATH__")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __USE_FAST_MATH__ 1
 #endif
 

diff  --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index 7689f4221ce440..005f0b006032c0 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -34,10 +34,10 @@
 
 // Device library provides fast low precision and slow full-recision
 // implementations for some functions. Which one gets selected depends on
-// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
-// -ffast-math or -fcuda-approx-transcendentals are in effect.
+// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
 #pragma push_macro("__FAST_OR_SLOW")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __FAST_OR_SLOW(fast, slow) fast
 #else
 #define __FAST_OR_SLOW(fast, slow) slow

diff  --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index 9954ddf075e66f..3b3afba0b18ca3 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -78,3 +78,11 @@
 // PTS-DAG: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1
 // NOPTS-NOT: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__
 // NOPTS-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
+
+// RUN: %clang -E -dM --offload-arch=gfx906 -nogpuinc -nogpulib \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=NOAPPROX %s
+// RUN: %clang -E -dM --offload-arch=gfx906 -nogpuinc -nogpulib -fgpu-approx-transcendentals \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=APPROX %s
+// NOAPPROX-NOT: #define __CLANG_GPU_APPROX_TRANSCENDENTALS__
+// APPROX: #define __CLANG_GPU_APPROX_TRANSCENDENTALS__ 1
+// APPROX: #define __CLANG_GPU_APPROX_TRANSCENDENTALS__ 1

diff  --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip
index edbe4ff3acbedf..af4d1893729a5a 100644
--- a/clang/test/Driver/hip-options.hip
+++ b/clang/test/Driver/hip-options.hip
@@ -169,3 +169,39 @@
 // RUN: %clang -### -nogpuinc -nogpulib -fhip-fp32-correctly-rounded-divide-sqrt \
 // RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefixes=CRDS %s
 // CRDS-NOT: "-f{{(no-)?}}hip-fp32-correctly-rounded-divide-sqrt"
+
+// Check -fgpu-approx-transcendentals is passed to clang -cc1 but
+// (default) -fno-gpu-approx-transcendentals is not.
+// -ffast-math implies -fgpu-approx-transcendentals, which can be overridden
+// by -fno-gpu-approx-transcendentals.
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fgpu-approx-transcendentals \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=APPROX %s
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -ffast-math \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=APPROX %s
+
+// APPROX: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fgpu-approx-transcendentals"
+// APPROX: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fgpu-approx-transcendentals"
+
+// RUN: %clang -### -nogpuinc -nogpulib -fno-gpu-approx-transcendentals \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOAPPROX %s
+
+// RUN: %clang -### -nogpuinc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOAPPROX %s
+
+// RUN: %clang -### -nogpuinc -nogpulib -ffast-math -fno-fast-math \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOAPPROX %s
+
+// RUN: %clang -### -nogpuinc -nogpulib -ffast-math -fno-gpu-approx-transcendentals \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOAPPROX %s
+
+// NOAPPROX-NOT: "-f{{(no-)?}}gpu-approx-transcendentals"
+
+// Check no warnings for -fgpu-approx-transcendentals.
+
+// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fgpu-approx-transcendentals \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | count 0
+
+// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fgpu-approx-transcendentals \
+// RUN:   -x c++ %s 2>&1 | count 0

diff  --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index a4c00bfb4a1a04..6b3e06b4895bc5 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -22,7 +22,7 @@
 // RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
 // RUN:   -internal-isystem %S/Inputs/include \
 // RUN:   -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -fcuda-approx-transcendentals -o - \
+// RUN:   -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -fgpu-approx-transcendentals -o - \
 // RUN:   -D__HIPCC_RTC__ | FileCheck -check-prefixes=CHECK,APPROX %s
 
 #define BOOL_TYPE int
@@ -3694,10 +3694,13 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) {
 //
 // APPROX-LABEL: @test_sincosf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
-// APPROX-NEXT:    store float [[CALL_I_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT:    [[CALL1_I_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
-// APPROX-NEXT:    store float [[CALL1_I_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
+// APPROX-NEXT:    [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
+// APPROX-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT:    store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
+// APPROX-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
+// APPROX-NEXT:    store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
+// APPROX-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
 // APPROX-NEXT:    ret void
 //
 extern "C" __device__ void test_sincosf(float x, float *y, float *z) {

diff  --git a/clang/test/Headers/nvptx_device_math_sin.c b/clang/test/Headers/nvptx_device_math_sin.c
index acd138ae4ec3ec..884f98ef4ef540 100644
--- a/clang/test/Headers/nvptx_device_math_sin.c
+++ b/clang/test/Headers/nvptx_device_math_sin.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
 // RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast
-// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=FAST
+// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fgpu-approx-transcendentals -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=FAST
 // expected-no-diagnostics
 
 #include <math.h>

diff  --git a/clang/test/Headers/nvptx_device_math_sin.cpp b/clang/test/Headers/nvptx_device_math_sin.cpp
index 10abe8c12e8720..3f4fe5bff6b2e0 100644
--- a/clang/test/Headers/nvptx_device_math_sin.cpp
+++ b/clang/test/Headers/nvptx_device_math_sin.cpp
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
 // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast
-// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=FAST
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fgpu-approx-transcendentals -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=FAST
 // expected-no-diagnostics
 
 #include <cmath>

diff  --git a/clang/test/Preprocessor/cuda-approx-transcendentals.cu b/clang/test/Preprocessor/cuda-approx-transcendentals.cu
index 409eabb4dac049..747dc7ccaf63c9 100644
--- a/clang/test/Preprocessor/cuda-approx-transcendentals.cu
+++ b/clang/test/Preprocessor/cuda-approx-transcendentals.cu
@@ -3,6 +3,6 @@
 // RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
 // RUN: %clang -ffast-math --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
 
-// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
-// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
-// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// HOST-NOT: __CLANG_GPU_APPROX_TRANSCENDENTALS__
+// DEVICE-NOFAST-NOT: __GPU_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-FAST: __CLANG_GPU_APPROX_TRANSCENDENTALS__