[clang] 11d06b9 - [HIP] Add default header and include path

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Fri Jun 5 09:45:23 PDT 2020


Author: Yaxun (Sam) Liu
Date: 2020-06-05T12:44:57-04:00
New Revision: 11d06b9511bd25aabbfad10dff548b0ce29135a5

URL: https://github.com/llvm/llvm-project/commit/11d06b9511bd25aabbfad10dff548b0ce29135a5
DIFF: https://github.com/llvm/llvm-project/commit/11d06b9511bd25aabbfad10dff548b0ce29135a5.diff

LOG: [HIP] Add default header and include path

To support std::complex and some other standard C/C++ functions in HIP device code,
they need to be forced to be __host__ __device__ functions by pragmas. This is done
by some clang standard C++ wrapper headers which are shared between cuda-clang and hip-Clang.

For these standard C++ wapper headers to work properly, specific include path order
has to be enforced:

  clang C++ wrapper include path
  standard C++ include path
  clang include path

Also, these C++ wrapper headers require device version of some standard C/C++ functions
must be declared before including them. This needs to be done by including a default
header which declares or defines these device functions. The default header is always
included before any other headers are included by users.

This patch adds the the default header and include path for HIP.

Differential Revision: https://reviews.llvm.org/D81176

Added: 
    clang/lib/Driver/ToolChains/ROCm.h
    clang/lib/Headers/__clang_hip_libdevice_declares.h
    clang/lib/Headers/__clang_hip_math.h
    clang/lib/Headers/__clang_hip_runtime_wrapper.h
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc
    clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc
    clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h
    clang/test/Driver/hip-include-path.hip

Modified: 
    clang/include/clang/Basic/DiagnosticDriverKinds.td
    clang/include/clang/Driver/Options.td
    clang/include/clang/Driver/ToolChain.h
    clang/lib/Driver/ToolChain.cpp
    clang/lib/Driver/ToolChains/AMDGPU.cpp
    clang/lib/Driver/ToolChains/AMDGPU.h
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/lib/Driver/ToolChains/Gnu.cpp
    clang/lib/Driver/ToolChains/Gnu.h
    clang/lib/Driver/ToolChains/HIP.cpp
    clang/lib/Driver/ToolChains/HIP.h
    clang/lib/Driver/ToolChains/Linux.cpp
    clang/lib/Driver/ToolChains/Linux.h
    clang/lib/Driver/ToolChains/MSVC.cpp
    clang/lib/Driver/ToolChains/MSVC.h
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/__clang_cuda_math_forward_declares.h
    clang/test/Driver/hip-device-libs.hip
    clang/test/Driver/rocm-detect.cl
    clang/test/Driver/rocm-detect.hip
    clang/test/Driver/rocm-device-libs.cl
    clang/test/Driver/rocm-not-found.cl

Removed: 
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc
    clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc


################################################################################
diff  --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index d010a7dfb2de..80242e53a1ae 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -58,7 +58,7 @@ def err_drv_no_cuda_libdevice : Error<
 
 def err_drv_no_rocm_installation : Error<
   "cannot find ROCm installation.  Provide its path via --rocm-path, or pass "
-  "-nogpulib.">;
+  "-nogpulib and -nogpuinc to build without ROCm device library and HIP includes.">;
 def err_drv_no_rocm_device_lib : Error<
   "cannot find device library for %0. Provide path to 
diff erent ROCm installation "
   "via --rocm-path, or pass -nogpulib to build without linking default libraries.">;

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d3f0ccb09ef4..c7cae452215a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2640,7 +2640,8 @@ def no_pedantic : Flag<["-", "--"], "no-pedantic">, Group<pedantic_Group>;
 def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
 def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
   HelpText<"Disable builtin #include directories">;
-def nocudainc : Flag<["-"], "nocudainc">;
+def nogpuinc : Flag<["-"], "nogpuinc">;
+def : Flag<["-"], "nocudainc">, Alias<nogpuinc>;
 def nogpulib : Flag<["-"], "nogpulib">,
   HelpText<"Do not link device library for CUDA/HIP device compilation">;
 def : Flag<["-"], "nocudalib">, Alias<nogpulib>;

diff  --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 0a35e9e6a01a..e8bb86be5554 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -617,6 +617,10 @@ class ToolChain {
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
 
+  /// Add arguments to use system-specific HIP includes.
+  virtual void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                 llvm::opt::ArgStringList &CC1Args) const;
+
   /// Add arguments to use MCU GCC toolchain includes.
   virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const;

diff  --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index cf04fd07e2a0..2bda7de8ac3a 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -991,6 +991,9 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
 void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                    ArgStringList &CC1Args) const {}
 
+void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                  ArgStringList &CC1Args) const {}
+
 void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                     ArgStringList &CC1Args) const {}
 

diff  --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 3e51bd00bae4..44a7e7fc3be0 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -199,6 +199,40 @@ void RocmInstallationDetector::print(raw_ostream &OS) const {
     OS << "Found ROCm installation: " << InstallPath << '\n';
 }
 
+void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                                 ArgStringList &CC1Args) const {
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    // HIP header includes standard library wrapper headers under clang
+    // cuda_wrappers directory. Since these wrapper headers include_next
+    // standard C++ headers, whereas libc++ headers include_next other clang
+    // headers. The include paths have to follow this order:
+    // - wrapper include path
+    // - standard C++ include path
+    // - other clang include path
+    // Since standard C++ and other clang include paths are added in other
+    // places after this function, here we only need to make sure wrapper
+    // include path is added.
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include");
+    llvm::sys::path::append(P, "cuda_wrappers");
+    CC1Args.push_back("-internal-isystem");
+    CC1Args.push_back(DriverArgs.MakeArgString(P));
+    CC1Args.push_back("-include");
+    CC1Args.push_back("__clang_hip_runtime_wrapper.h");
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+    return;
+
+  if (!isValid()) {
+    D.Diag(diag::err_drv_no_rocm_installation);
+    return;
+  }
+
+  CC1Args.push_back("-internal-isystem");
+  CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
+}
+
 void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfo &Output,
                                   const InputInfoList &Inputs,

diff  --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index 230af868298f..9418a0a509c7 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_AMDGPU_H
 
 #include "Gnu.h"
+#include "ROCm.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
@@ -21,148 +22,6 @@
 namespace clang {
 namespace driver {
 
-/// A class to find a viable ROCM installation
-/// TODO: Generalize to handle libclc.
-class RocmInstallationDetector {
-private:
-  struct ConditionalLibrary {
-    SmallString<0> On;
-    SmallString<0> Off;
-
-    bool isValid() const {
-      return !On.empty() && !Off.empty();
-    }
-
-    StringRef get(bool Enabled) const {
-      assert(isValid());
-      return Enabled ? On : Off;
-    }
-  };
-
-  const Driver &D;
-  bool IsValid = false;
-  //RocmVersion Version = RocmVersion::UNKNOWN;
-  SmallString<0> InstallPath;
-  //SmallString<0> BinPath;
-  SmallString<0> LibPath;
-  SmallString<0> LibDevicePath;
-  SmallString<0> IncludePath;
-  llvm::StringMap<std::string> LibDeviceMap;
-
-  // Libraries that are always linked.
-  SmallString<0> OCML;
-  SmallString<0> OCKL;
-
-  // Libraries that are always linked depending on the language
-  SmallString<0> OpenCL;
-  SmallString<0> HIP;
-
-  // Libraries swapped based on compile flags.
-  ConditionalLibrary WavefrontSize64;
-  ConditionalLibrary FiniteOnly;
-  ConditionalLibrary UnsafeMath;
-  ConditionalLibrary DenormalsAreZero;
-  ConditionalLibrary CorrectlyRoundedSqrt;
-
-  bool allGenericLibsValid() const {
-    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
-           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
-           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
-           CorrectlyRoundedSqrt.isValid();
-  }
-
-  // CUDA architectures for which we have raised an error in
-  // CheckRocmVersionSupportsArch.
-  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
-
-  void scanLibDevicePath();
-
-public:
-  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
-                           const llvm::opt::ArgList &Args);
-
-  /// Add arguments needed to link default bitcode libraries.
-  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
-                                  llvm::opt::ArgStringList &CC1Args,
-                                  StringRef LibDeviceFile, bool Wave64,
-                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
-                                  bool FastRelaxedMath, bool CorrectSqrt) const;
-
-  /// Emit an error if Version does not support the given Arch.
-  ///
-  /// If either Version or Arch is unknown, does not emit an error.  Emits at
-  /// most one error per Arch.
-  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
-
-  /// Check whether we detected a valid Rocm install.
-  bool isValid() const { return IsValid; }
-  /// Print information about the detected CUDA installation.
-  void print(raw_ostream &OS) const;
-
-  /// Get the detected Rocm install's version.
-  // RocmVersion version() const { return Version; }
-
-  /// Get the detected Rocm installation path.
-  StringRef getInstallPath() const { return InstallPath; }
-
-  /// Get the detected path to Rocm's bin directory.
-  // StringRef getBinPath() const { return BinPath; }
-
-  /// Get the detected Rocm Include path.
-  StringRef getIncludePath() const { return IncludePath; }
-
-  /// Get the detected Rocm library path.
-  StringRef getLibPath() const { return LibPath; }
-
-  /// Get the detected Rocm device library path.
-  StringRef getLibDevicePath() const { return LibDevicePath; }
-
-  StringRef getOCMLPath() const {
-    assert(!OCML.empty());
-    return OCML;
-  }
-
-  StringRef getOCKLPath() const {
-    assert(!OCKL.empty());
-    return OCKL;
-  }
-
-  StringRef getOpenCLPath() const {
-    assert(!OpenCL.empty());
-    return OpenCL;
-  }
-
-  StringRef getHIPPath() const {
-    assert(!HIP.empty());
-    return HIP;
-  }
-
-  StringRef getWavefrontSize64Path(bool Enabled) const {
-    return WavefrontSize64.get(Enabled);
-  }
-
-  StringRef getFiniteOnlyPath(bool Enabled) const {
-    return FiniteOnly.get(Enabled);
-  }
-
-  StringRef getUnsafeMathPath(bool Enabled) const {
-    return UnsafeMath.get(Enabled);
-  }
-
-  StringRef getDenormalsAreZeroPath(bool Enabled) const {
-    return DenormalsAreZero.get(Enabled);
-  }
-
-  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
-    return CorrectlyRoundedSqrt.get(Enabled);
-  }
-
-  /// Get libdevice file for given architecture
-  std::string getLibDeviceFile(StringRef Gpu) const {
-    return LibDeviceMap.lookup(Gpu);
-  }
-};
-
 namespace tools {
 namespace amdgpu {
 

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b20048768e44..32e60c13e1d8 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1202,12 +1202,14 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_MP);
   Args.AddLastArg(CmdArgs, options::OPT_MV);
 
-  // Add offload include arguments specific for CUDA.  This must happen before
-  // we -I or -include anything else, because we must pick up the CUDA headers
-  // from the particular CUDA installation, rather than from e.g.
-  // /usr/local/include.
+  // Add offload include arguments specific for CUDA/HIP.  This must happen
+  // before we -I or -include anything else, because we must pick up the
+  // CUDA/HIP headers from the particular CUDA/ROCm installation, rather than
+  // from e.g. /usr/local/include.
   if (JA.isOffloading(Action::OFK_Cuda))
     getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+  if (JA.isOffloading(Action::OFK_HIP))
+    getToolChain().AddHIPIncludeArgs(Args, CmdArgs);
 
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.

diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 08064de13b5b..55b205921e39 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -241,7 +241,7 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
     CC1Args.push_back(DriverArgs.MakeArgString(P));
   }
 
-  if (DriverArgs.hasArg(options::OPT_nocudainc))
+  if (DriverArgs.hasArg(options::OPT_nogpuinc))
     return;
 
   if (!isValid()) {
@@ -765,7 +765,7 @@ void CudaToolChain::adjustDebugInfoKind(
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
-  if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
+  if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
       !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
     StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
     assert(!Arch.empty() && "Must have an explicit GPU arch.");

diff  --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index ac9eb46dacb5..a392fa00ea75 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2578,7 +2578,7 @@ bool Generic_GCC::GCCInstallationDetector::ScanGentooGccConfig(
 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                          const ArgList &Args)
     : ToolChain(D, Triple, Args), GCCInstallation(D),
-      CudaInstallation(D, Triple, Args) {
+      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);

diff  --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h
index e43414ae35f0..8ef9b4fdb6cd 100644
--- a/clang/lib/Driver/ToolChains/Gnu.h
+++ b/clang/lib/Driver/ToolChains/Gnu.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_GNU_H
 
 #include "Cuda.h"
+#include "ROCm.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include <set>
@@ -278,6 +279,7 @@ class LLVM_LIBRARY_VISIBILITY Generic_GCC : public ToolChain {
 protected:
   GCCInstallationDetector GCCInstallation;
   CudaInstallationDetector CudaInstallation;
+  RocmInstallationDetector RocmInstallation;
 
 public:
   Generic_GCC(const Driver &D, const llvm::Triple &Triple,

diff  --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index a7510f721145..7e58fe7bb3fb 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -427,6 +427,11 @@ void HIPToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
   HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
 }
 
+void HIPToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                     ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 SanitizerMask HIPToolChain::getSupportedSanitizers() const {
   // The HIPToolChain only supports sanitizers in the sense that it allows
   // sanitizer arguments on the command line if they are supported by the host

diff  --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h
index 01a0ee916bc0..353775e5bbf7 100644
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h
@@ -107,6 +107,8 @@ class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain {
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
 
   SanitizerMask getSupportedSanitizers() const override;
 

diff  --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 8188c972f446..7df49c787c8e 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -797,6 +797,11 @@ void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void Linux::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                              ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                 ArgStringList &CC1Args) const {
   if (GCCInstallation.isValid()) {

diff  --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h
index 550cb96b0b9a..0a7e5bac25b1 100644
--- a/clang/lib/Driver/ToolChains/Linux.h
+++ b/clang/lib/Driver/ToolChains/Linux.h
@@ -31,6 +31,8 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF {
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
   CXXStdlibType GetDefaultCXXStdlibType() const override;

diff  --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index c8b272c60919..8271ca780f80 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -739,7 +739,8 @@ std::unique_ptr<Command> visualstudio::Compiler::GetCommand(
 
 MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args) {
+    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args),
+      RocmInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
@@ -797,6 +798,11 @@ void MSVCToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void MSVCToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                      ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 void MSVCToolChain::printVerboseInfo(raw_ostream &OS) const {
   CudaInstallation.print(OS);
 }

diff  --git a/clang/lib/Driver/ToolChains/MSVC.h b/clang/lib/Driver/ToolChains/MSVC.h
index 41a69a82fecf..85208eaa3cc3 100644
--- a/clang/lib/Driver/ToolChains/MSVC.h
+++ b/clang/lib/Driver/ToolChains/MSVC.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 
+#include "AMDGPU.h"
 #include "Cuda.h"
 #include "clang/Basic/DebugInfoOptions.h"
 #include "clang/Driver/Compilation.h"
@@ -125,6 +126,9 @@ class LLVM_LIBRARY_VISIBILITY MSVCToolChain : public ToolChain {
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
 
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
+
   bool getWindowsSDKLibraryPath(std::string &path) const;
   /// Check if Universal CRT should be used if available
   bool getUniversalCRTLibraryPath(std::string &path) const;
@@ -155,6 +159,7 @@ class LLVM_LIBRARY_VISIBILITY MSVCToolChain : public ToolChain {
   std::string VCToolChainPath;
   ToolsetLayout VSLayout = ToolsetLayout::OlderVS;
   CudaInstallationDetector CudaInstallation;
+  RocmInstallationDetector RocmInstallation;
 };
 
 } // end namespace toolchains

diff  --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h
new file mode 100644
index 000000000000..9f5fa451472b
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/ROCm.h
@@ -0,0 +1,166 @@
+//===--- ROCm.h - ROCm installation detector --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
+
+#include "clang/Basic/Cuda.h"
+#include "clang/Driver/Options.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+
+namespace clang {
+namespace driver {
+
+/// A class to find a viable ROCM installation
+/// TODO: Generalize to handle libclc.
+class RocmInstallationDetector {
+private:
+  struct ConditionalLibrary {
+    SmallString<0> On;
+    SmallString<0> Off;
+
+    bool isValid() const { return !On.empty() && !Off.empty(); }
+
+    StringRef get(bool Enabled) const {
+      assert(isValid());
+      return Enabled ? On : Off;
+    }
+  };
+
+  const Driver &D;
+  bool IsValid = false;
+  // RocmVersion Version = RocmVersion::UNKNOWN;
+  SmallString<0> InstallPath;
+  // SmallString<0> BinPath;
+  SmallString<0> LibPath;
+  SmallString<0> LibDevicePath;
+  SmallString<0> IncludePath;
+  llvm::StringMap<std::string> LibDeviceMap;
+
+  // Libraries that are always linked.
+  SmallString<0> OCML;
+  SmallString<0> OCKL;
+
+  // Libraries that are always linked depending on the language
+  SmallString<0> OpenCL;
+  SmallString<0> HIP;
+
+  // Libraries swapped based on compile flags.
+  ConditionalLibrary WavefrontSize64;
+  ConditionalLibrary FiniteOnly;
+  ConditionalLibrary UnsafeMath;
+  ConditionalLibrary DenormalsAreZero;
+  ConditionalLibrary CorrectlyRoundedSqrt;
+
+  bool allGenericLibsValid() const {
+    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
+           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
+           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
+           CorrectlyRoundedSqrt.isValid();
+  }
+
+  // GPU architectures for which we have raised an error in
+  // CheckRocmVersionSupportsArch.
+  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
+
+  void scanLibDevicePath();
+
+public:
+  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
+                           const llvm::opt::ArgList &Args);
+
+  /// Add arguments needed to link default bitcode libraries.
+  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
+                                  llvm::opt::ArgStringList &CC1Args,
+                                  StringRef LibDeviceFile, bool Wave64,
+                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
+                                  bool FastRelaxedMath, bool CorrectSqrt) const;
+
+  /// Emit an error if Version does not support the given Arch.
+  ///
+  /// If either Version or Arch is unknown, does not emit an error.  Emits at
+  /// most one error per Arch.
+  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
+
+  /// Check whether we detected a valid Rocm install.
+  bool isValid() const { return IsValid; }
+  /// Print information about the detected ROCm installation.
+  void print(raw_ostream &OS) const;
+
+  /// Get the detected Rocm install's version.
+  // RocmVersion version() const { return Version; }
+
+  /// Get the detected Rocm installation path.
+  StringRef getInstallPath() const { return InstallPath; }
+
+  /// Get the detected path to Rocm's bin directory.
+  // StringRef getBinPath() const { return BinPath; }
+
+  /// Get the detected Rocm Include path.
+  StringRef getIncludePath() const { return IncludePath; }
+
+  /// Get the detected Rocm library path.
+  StringRef getLibPath() const { return LibPath; }
+
+  /// Get the detected Rocm device library path.
+  StringRef getLibDevicePath() const { return LibDevicePath; }
+
+  StringRef getOCMLPath() const {
+    assert(!OCML.empty());
+    return OCML;
+  }
+
+  StringRef getOCKLPath() const {
+    assert(!OCKL.empty());
+    return OCKL;
+  }
+
+  StringRef getOpenCLPath() const {
+    assert(!OpenCL.empty());
+    return OpenCL;
+  }
+
+  StringRef getHIPPath() const {
+    assert(!HIP.empty());
+    return HIP;
+  }
+
+  StringRef getWavefrontSize64Path(bool Enabled) const {
+    return WavefrontSize64.get(Enabled);
+  }
+
+  StringRef getFiniteOnlyPath(bool Enabled) const {
+    return FiniteOnly.get(Enabled);
+  }
+
+  StringRef getUnsafeMathPath(bool Enabled) const {
+    return UnsafeMath.get(Enabled);
+  }
+
+  StringRef getDenormalsAreZeroPath(bool Enabled) const {
+    return DenormalsAreZero.get(Enabled);
+  }
+
+  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
+    return CorrectlyRoundedSqrt.get(Enabled);
+  }
+
+  /// Get libdevice file for given architecture
+  std::string getLibDeviceFile(StringRef Gpu) const {
+    return LibDeviceMap.lookup(Gpu);
+  }
+
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const;
+};
+
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H

diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1a1f7b30f106..fd9e3a0d672f 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -45,6 +45,9 @@ set(files
   __clang_cuda_libdevice_declares.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
+  __clang_hip_libdevice_declares.h
+  __clang_hip_math.h
+  __clang_hip_runtime_wrapper.h
   cetintrin.h
   cet.h
   cldemoteintrin.h

diff  --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
index 3d6d0b9115a1..7c0b3575b25a 100644
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -8,8 +8,8 @@
  */
 #ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
 #define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
-#ifndef __CUDA__
-#error "This file is for CUDA compilation only."
+#if !__CUDA__ && !__HIP__
+#error "This file is for CUDA/HIP compilation only."
 #endif
 
 // This file forward-declares of some math functions we (or the CUDA headers)

diff  --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h
new file mode 100644
index 000000000000..e1cd49a39c65
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -0,0 +1,326 @@
+/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
+#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+// BEGIN FLOAT
+__device__ __attribute__((const)) float __ocml_acos_f32(float);
+__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
+__device__ __attribute__((const)) float __ocml_asin_f32(float);
+__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
+__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
+__device__ __attribute__((const)) float __ocml_atan_f32(float);
+__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
+__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_ceil_f32(float);
+__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float,
+                                                                       float);
+__device__ float __ocml_cos_f32(float);
+__device__ float __ocml_native_cos_f32(float);
+__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
+__device__ float __ocml_cospi_f32(float);
+__device__ float __ocml_i0_f32(float);
+__device__ float __ocml_i1_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
+__device__ __attribute__((pure)) float __ocml_erf_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
+__device__ __attribute__((const)) float __ocml_fabs_f32(float);
+__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
+__device__ __attribute__((const)) float __ocml_floor_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float,
+                                                                   float);
+__device__ float __ocml_frexp_f32(float,
+                                  __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
+__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
+__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
+__device__ __attribute__((const)) int __ocml_isinf_f32(float);
+__device__ __attribute__((const)) int __ocml_isnan_f32(float);
+__device__ float __ocml_j0_f32(float);
+__device__ float __ocml_j1_f32(float);
+__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
+__device__ float __ocml_lgamma_f32(float);
+__device__ __attribute__((pure)) float __ocml_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
+__device__ __attribute__((pure)) float __ocml_log2_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
+__device__ __attribute__((const)) float __ocml_logb_f32(float);
+__device__ __attribute__((pure)) float __ocml_log_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
+__device__ float __ocml_modf_f32(float,
+                                 __attribute__((address_space(5))) float *);
+__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
+__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
+__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
+                                                        float);
+__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
+__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
+__device__ float __ocml_remquo_f32(float, float,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
+__device__ __attribute__((const)) float __ocml_rint_f32(float);
+__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
+                                                         float);
+__device__ __attribute__((const)) float __ocml_round_f32(float);
+__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
+__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
+__device__ __attribute__((const)) int __ocml_signbit_f32(float);
+__device__ float __ocml_sincos_f32(float,
+                                   __attribute__((address_space(5))) float *);
+__device__ float __ocml_sincospi_f32(float,
+                                     __attribute__((address_space(5))) float *);
+__device__ float __ocml_sin_f32(float);
+__device__ float __ocml_native_sin_f32(float);
+__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
+__device__ float __ocml_sinpi_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
+__device__ float __ocml_tan_f32(float);
+__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
+__device__ float __ocml_tgamma_f32(float);
+__device__ __attribute__((const)) float __ocml_trunc_f32(float);
+__device__ float __ocml_y0_f32(float);
+__device__ float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
+
+__device__ __attribute__((const)) float
+__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__ __attribute__((const)) double __ocml_acos_f64(double);
+__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
+__device__ __attribute__((const)) double __ocml_asin_f64(double);
+__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
+__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
+__device__ __attribute__((const)) double __ocml_atan_f64(double);
+__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
+__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_ceil_f64(double);
+__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
+__device__ double __ocml_cos_f64(double);
+__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
+__device__ double __ocml_cospi_f64(double);
+__device__ double __ocml_i0_f64(double);
+__device__ double __ocml_i1_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
+__device__ __attribute__((pure)) double __ocml_erf_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp_f64(double);
+__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
+__device__ __attribute__((const)) double __ocml_fabs_f64(double);
+__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
+__device__ __attribute__((const)) double __ocml_floor_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
+__device__ double __ocml_frexp_f64(double,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
+__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
+__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
+__device__ __attribute__((const)) int __ocml_isinf_f64(double);
+__device__ __attribute__((const)) int __ocml_isnan_f64(double);
+__device__ double __ocml_j0_f64(double);
+__device__ double __ocml_j1_f64(double);
+__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
+__device__ double __ocml_lgamma_f64(double);
+__device__ __attribute__((pure)) double __ocml_log10_f64(double);
+__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
+__device__ __attribute__((pure)) double __ocml_log2_f64(double);
+__device__ __attribute__((const)) double __ocml_logb_f64(double);
+__device__ __attribute__((pure)) double __ocml_log_f64(double);
+__device__ double __ocml_modf_f64(double,
+                                  __attribute__((address_space(5))) double *);
+__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
+__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
+__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
+                                                         double);
+__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
+                                                         double);
+__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
+__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
+__device__ double __ocml_remquo_f64(double, double,
+                                    __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
+__device__ __attribute__((const)) double __ocml_rint_f64(double);
+__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
+                                                          double);
+__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
+                                                          double, double);
+__device__ __attribute__((const)) double __ocml_round_f64(double);
+__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
+__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
+__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
+__device__ __attribute__((const)) int __ocml_signbit_f64(double);
+__device__ double __ocml_sincos_f64(double,
+                                    __attribute__((address_space(5))) double *);
+__device__ double
+__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
+__device__ double __ocml_sin_f64(double);
+__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
+__device__ double __ocml_sinpi_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
+__device__ double __ocml_tan_f64(double);
+__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
+__device__ double __ocml_tgamma_f64(double);
+__device__ __attribute__((const)) double __ocml_trunc_f64(double);
+__device__ double __ocml_y0_f64(double);
+__device__ double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
+                                                            double);
+
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
+
+__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+__device__ _Float16 __ocml_cos_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
+                                                          _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+__device__ _Float16 __ocml_sin_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+
+typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
+                                                     float c, bool s);
+__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+__device__ __2f16 __ocml_cos_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+__device__ __attribute__((const))
+__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+__device__ inline __2f16
+__llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL.
+{
+  return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)};
+}
+__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+__device__ __2f16 __ocml_sin_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+
+} // extern "C"
+
+#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__

diff  --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
new file mode 100644
index 000000000000..fcc9a3bdbe17
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -0,0 +1,1185 @@
+/*===---- __clang_hip_math.h - HIP math decls -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_MATH_H__
+#define __CLANG_HIP_MATH_H__
+
+#include <algorithm>
+#include <limits.h>
+#include <limits>
+#include <stdint.h>
+
+#pragma push_macro("__DEVICE__")
+#pragma push_macro("__RETURN_TYPE")
+
+// to be consistent with __clang_cuda_math_forward_declares
+#define __DEVICE__ static __device__
+#define __RETURN_TYPE bool
+
+__DEVICE__
+inline uint64_t __make_mantissa_base8(const char *__tagp) {
+  uint64_t __r = 0;
+  while (__tagp) {
+    char __tmp = *__tagp;
+
+    if (__tmp >= '0' && __tmp <= '7')
+      __r = (__r * 8u) + __tmp - '0';
+    else
+      return 0;
+
+    ++__tagp;
+  }
+
+  return __r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa_base10(const char *__tagp) {
+  uint64_t __r = 0;
+  while (__tagp) {
+    char __tmp = *__tagp;
+
+    if (__tmp >= '0' && __tmp <= '9')
+      __r = (__r * 10u) + __tmp - '0';
+    else
+      return 0;
+
+    ++__tagp;
+  }
+
+  return __r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa_base16(const char *__tagp) {
+  uint64_t __r = 0;
+  while (__tagp) {
+    char __tmp = *__tagp;
+
+    if (__tmp >= '0' && __tmp <= '9')
+      __r = (__r * 16u) + __tmp - '0';
+    else if (__tmp >= 'a' && __tmp <= 'f')
+      __r = (__r * 16u) + __tmp - 'a' + 10;
+    else if (__tmp >= 'A' && __tmp <= 'F')
+      __r = (__r * 16u) + __tmp - 'A' + 10;
+    else
+      return 0;
+
+    ++__tagp;
+  }
+
+  return __r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa(const char *__tagp) {
+  if (!__tagp)
+    return 0u;
+
+  if (*__tagp == '0') {
+    ++__tagp;
+
+    if (*__tagp == 'x' || *__tagp == 'X')
+      return __make_mantissa_base16(__tagp);
+    else
+      return __make_mantissa_base8(__tagp);
+  }
+
+  return __make_mantissa_base10(__tagp);
+}
+
+// BEGIN FLOAT
+__DEVICE__
+inline float abs(float __x) { return __ocml_fabs_f32(__x); }
+__DEVICE__
+inline float acosf(float __x) { return __ocml_acos_f32(__x); }
+__DEVICE__
+inline float acoshf(float __x) { return __ocml_acosh_f32(__x); }
+__DEVICE__
+inline float asinf(float __x) { return __ocml_asin_f32(__x); }
+__DEVICE__
+inline float asinhf(float __x) { return __ocml_asinh_f32(__x); }
+__DEVICE__
+inline float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
+__DEVICE__
+inline float atanf(float __x) { return __ocml_atan_f32(__x); }
+__DEVICE__
+inline float atanhf(float __x) { return __ocml_atanh_f32(__x); }
+__DEVICE__
+inline float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
+__DEVICE__
+inline float ceilf(float __x) { return __ocml_ceil_f32(__x); }
+__DEVICE__
+inline float copysignf(float __x, float __y) {
+  return __ocml_copysign_f32(__x, __y);
+}
+__DEVICE__
+inline float cosf(float __x) { return __ocml_cos_f32(__x); }
+__DEVICE__
+inline float coshf(float __x) { return __ocml_cosh_f32(__x); }
+__DEVICE__
+inline float cospif(float __x) { return __ocml_cospi_f32(__x); }
+__DEVICE__
+inline float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
+__DEVICE__
+inline float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
+__DEVICE__
+inline float erfcf(float __x) { return __ocml_erfc_f32(__x); }
+__DEVICE__
+inline float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
+__DEVICE__
+inline float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
+__DEVICE__
+inline float erff(float __x) { return __ocml_erf_f32(__x); }
+__DEVICE__
+inline float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
+__DEVICE__
+inline float exp10f(float __x) { return __ocml_exp10_f32(__x); }
+__DEVICE__
+inline float exp2f(float __x) { return __ocml_exp2_f32(__x); }
+__DEVICE__
+inline float expf(float __x) { return __ocml_exp_f32(__x); }
+__DEVICE__
+inline float expm1f(float __x) { return __ocml_expm1_f32(__x); }
+__DEVICE__
+inline float fabsf(float __x) { return __ocml_fabs_f32(__x); }
+__DEVICE__
+inline float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
+__DEVICE__
+inline float fdividef(float __x, float __y) { return __x / __y; }
+__DEVICE__
+inline float floorf(float __x) { return __ocml_floor_f32(__x); }
+__DEVICE__
+inline float fmaf(float __x, float __y, float __z) {
+  return __ocml_fma_f32(__x, __y, __z);
+}
+__DEVICE__
+inline float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
+__DEVICE__
+inline float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
+__DEVICE__
+inline float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
+__DEVICE__
+inline float frexpf(float __x, int *__nptr) {
+  int __tmp;
+  float __r =
+      __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
+  *__nptr = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
+__DEVICE__
+inline int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
+__DEVICE__
+inline __RETURN_TYPE isfinite(float __x) { return __ocml_isfinite_f32(__x); }
+__DEVICE__
+inline __RETURN_TYPE isinf(float __x) { return __ocml_isinf_f32(__x); }
+__DEVICE__
+inline __RETURN_TYPE isnan(float __x) { return __ocml_isnan_f32(__x); }
+__DEVICE__
+inline float j0f(float __x) { return __ocml_j0_f32(__x); }
+__DEVICE__
+inline float j1f(float __x) { return __ocml_j1_f32(__x); }
+__DEVICE__
+inline float jnf(int __n,
+                 float __x) { // TODO: we could use Ahmes multiplication
+                              // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case.
+  if (__n == 0)
+    return j0f(__x);
+  if (__n == 1)
+    return j1f(__x);
+
+  float __x0 = j0f(__x);
+  float __x1 = j1f(__x);
+  for (int __i = 1; __i < __n; ++__i) {
+    float __x2 = (2 * __i) / __x * __x1 - __x0;
+    __x0 = __x1;
+    __x1 = __x2;
+  }
+
+  return __x1;
+}
+__DEVICE__
+inline float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
+__DEVICE__
+inline float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
+__DEVICE__
+inline long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
+__DEVICE__
+inline long long int llroundf(float __x) { return __ocml_round_f32(__x); }
+__DEVICE__
+inline float log10f(float __x) { return __ocml_log10_f32(__x); }
+__DEVICE__
+inline float log1pf(float __x) { return __ocml_log1p_f32(__x); }
+__DEVICE__
+inline float log2f(float __x) { return __ocml_log2_f32(__x); }
+__DEVICE__
+inline float logbf(float __x) { return __ocml_logb_f32(__x); }
+__DEVICE__
+inline float logf(float __x) { return __ocml_log_f32(__x); }
+__DEVICE__
+inline long int lrintf(float __x) { return __ocml_rint_f32(__x); }
+__DEVICE__
+inline long int lroundf(float __x) { return __ocml_round_f32(__x); }
+__DEVICE__
+inline float modff(float __x, float *__iptr) {
+  float __tmp;
+  float __r =
+      __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
+  *__iptr = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline float nanf(const char *__tagp) {
+  union {
+    float val;
+    struct ieee_float {
+      uint32_t mantissa : 22;
+      uint32_t quiet : 1;
+      uint32_t exponent : 8;
+      uint32_t sign : 1;
+    } bits;
+
+    static_assert(sizeof(float) == sizeof(ieee_float), "");
+  } __tmp;
+
+  __tmp.bits.sign = 0u;
+  __tmp.bits.exponent = ~0u;
+  __tmp.bits.quiet = 1u;
+  __tmp.bits.mantissa = __make_mantissa(__tagp);
+
+  return __tmp.val;
+}
+__DEVICE__
+inline float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
+__DEVICE__
+inline float nextafterf(float __x, float __y) {
+  return __ocml_nextafter_f32(__x, __y);
+}
+__DEVICE__
+inline float norm3df(float __x, float __y, float __z) {
+  return __ocml_len3_f32(__x, __y, __z);
+}
+__DEVICE__
+inline float norm4df(float __x, float __y, float __z, float __w) {
+  return __ocml_len4_f32(__x, __y, __z, __w);
+}
+__DEVICE__
+inline float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
+__DEVICE__
+inline float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
+__DEVICE__
+inline float
+normf(int __dim,
+      const float *__a) { // TODO: placeholder until OCML adds support.
+  float __r = 0;
+  while (__dim--) {
+    __r += __a[0] * __a[0];
+    ++__a;
+  }
+
+  return __ocml_sqrt_f32(__r);
+}
+__DEVICE__
+inline float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
+__DEVICE__
+inline float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
+__DEVICE__
+inline float remainderf(float __x, float __y) {
+  return __ocml_remainder_f32(__x, __y);
+}
+__DEVICE__
+inline float remquof(float __x, float __y, int *__quo) {
+  int __tmp;
+  float __r = __ocml_remquo_f32(
+      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
+  *__quo = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline float rhypotf(float __x, float __y) {
+  return __ocml_rhypot_f32(__x, __y);
+}
+__DEVICE__
+inline float rintf(float __x) { return __ocml_rint_f32(__x); }
+__DEVICE__
+inline float rnorm3df(float __x, float __y, float __z) {
+  return __ocml_rlen3_f32(__x, __y, __z);
+}
+
+__DEVICE__
+inline float rnorm4df(float __x, float __y, float __z, float __w) {
+  return __ocml_rlen4_f32(__x, __y, __z, __w);
+}
+__DEVICE__
+inline float
+rnormf(int __dim,
+       const float *__a) { // TODO: placeholder until OCML adds support.
+  float __r = 0;
+  while (__dim--) {
+    __r += __a[0] * __a[0];
+    ++__a;
+  }
+
+  return __ocml_rsqrt_f32(__r);
+}
+__DEVICE__
+inline float roundf(float __x) { return __ocml_round_f32(__x); }
+__DEVICE__
+inline float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
+__DEVICE__
+inline float scalblnf(float __x, long int __n) {
+  return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n)
+                         : __ocml_scalb_f32(__x, __n);
+}
+__DEVICE__
+inline float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
+__DEVICE__
+inline __RETURN_TYPE signbit(float __x) { return __ocml_signbit_f32(__x); }
+__DEVICE__
+inline void sincosf(float __x, float *__sptr, float *__cptr) {
+  float __tmp;
+
+  *__sptr =
+      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
+  *__cptr = __tmp;
+}
+__DEVICE__
+inline void sincospif(float __x, float *__sptr, float *__cptr) {
+  float __tmp;
+
+  *__sptr = __ocml_sincospi_f32(
+      __x, (__attribute__((address_space(5))) float *)&__tmp);
+  *__cptr = __tmp;
+}
+__DEVICE__
+inline float sinf(float __x) { return __ocml_sin_f32(__x); }
+__DEVICE__
+inline float sinhf(float __x) { return __ocml_sinh_f32(__x); }
+__DEVICE__
+inline float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
+__DEVICE__
+inline float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
+__DEVICE__
+inline float tanf(float __x) { return __ocml_tan_f32(__x); }
+__DEVICE__
+inline float tanhf(float __x) { return __ocml_tanh_f32(__x); }
+__DEVICE__
+inline float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
+__DEVICE__
+inline float truncf(float __x) { return __ocml_trunc_f32(__x); }
+__DEVICE__
+inline float y0f(float __x) { return __ocml_y0_f32(__x); }
+__DEVICE__
+inline float y1f(float __x) { return __ocml_y1_f32(__x); }
+__DEVICE__
+inline float ynf(int __n,
+                 float __x) { // TODO: we could use Ahmes multiplication
+                              // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (__n == 0)
+    return y0f(__x);
+  if (__n == 1)
+    return y1f(__x);
+
+  float __x0 = y0f(__x);
+  float __x1 = y1f(__x);
+  for (int __i = 1; __i < __n; ++__i) {
+    float __x2 = (2 * __i) / __x * __x1 - __x0;
+    __x0 = __x1;
+    __x1 = __x2;
+  }
+
+  return __x1;
+}
+
+// BEGIN INTRINSICS
+__DEVICE__
+inline float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
+__DEVICE__
+inline float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
+__DEVICE__
+inline float __expf(float __x) { return __ocml_native_exp_f32(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fadd_rd(float __x, float __y) {
+  return __ocml_add_rtn_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __fadd_rn(float __x, float __y) { return __x + __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fadd_ru(float __x, float __y) {
+  return __ocml_add_rtp_f32(__x, __y);
+}
+__DEVICE__
+inline float __fadd_rz(float __x, float __y) {
+  return __ocml_add_rtz_f32(__x, __y);
+}
+__DEVICE__
+inline float __fdiv_rd(float __x, float __y) {
+  return __ocml_div_rtn_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __fdiv_rn(float __x, float __y) { return __x / __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fdiv_ru(float __x, float __y) {
+  return __ocml_div_rtp_f32(__x, __y);
+}
+__DEVICE__
+inline float __fdiv_rz(float __x, float __y) {
+  return __ocml_div_rtz_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __fdividef(float __x, float __y) { return __x / __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmaf_rd(float __x, float __y, float __z) {
+  return __ocml_fma_rtn_f32(__x, __y, __z);
+}
+#endif
+__DEVICE__
+inline float __fmaf_rn(float __x, float __y, float __z) {
+  return __ocml_fma_f32(__x, __y, __z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmaf_ru(float __x, float __y, float __z) {
+  return __ocml_fma_rtp_f32(__x, __y, __z);
+}
+__DEVICE__
+inline float __fmaf_rz(float __x, float __y, float __z) {
+  return __ocml_fma_rtz_f32(__x, __y, __z);
+}
+__DEVICE__
+inline float __fmul_rd(float __x, float __y) {
+  return __ocml_mul_rtn_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __fmul_rn(float __x, float __y) { return __x * __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmul_ru(float __x, float __y) {
+  return __ocml_mul_rtp_f32(__x, __y);
+}
+__DEVICE__
+inline float __fmul_rz(float __x, float __y) {
+  return __ocml_mul_rtz_f32(__x, __y);
+}
+__DEVICE__
+inline float __frcp_rd(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
+#endif
+__DEVICE__
+inline float __frcp_rn(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __frcp_ru(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
+__DEVICE__
+inline float __frcp_rz(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
+#endif
+__DEVICE__
+inline float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
+#endif
+__DEVICE__
+inline float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
+__DEVICE__
+inline float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
+__DEVICE__
+inline float __fsub_rd(float __x, float __y) {
+  return __ocml_sub_rtn_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __fsub_rn(float __x, float __y) { return __x - __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsub_ru(float __x, float __y) {
+  return __ocml_sub_rtp_f32(__x, __y);
+}
+__DEVICE__
+inline float __fsub_rz(float __x, float __y) {
+  return __ocml_sub_rtz_f32(__x, __y);
+}
+#endif
+__DEVICE__
+inline float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
+__DEVICE__
+inline float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
+__DEVICE__
+inline float __logf(float __x) { return __ocml_native_log_f32(__x); }
+__DEVICE__
+inline float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
+__DEVICE__
+inline float __saturatef(float __x) {
+  return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x);
+}
+__DEVICE__
+inline void __sincosf(float __x, float *__sptr, float *__cptr) {
+  *__sptr = __ocml_native_sin_f32(__x);
+  *__cptr = __ocml_native_cos_f32(__x);
+}
+__DEVICE__
+inline float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
+__DEVICE__
+inline float __tanf(float __x) { return __ocml_tan_f32(__x); }
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__DEVICE__
+inline double abs(double __x) { return __ocml_fabs_f64(__x); }
+__DEVICE__
+inline double acos(double __x) { return __ocml_acos_f64(__x); }
+__DEVICE__
+inline double acosh(double __x) { return __ocml_acosh_f64(__x); }
+__DEVICE__
+inline double asin(double __x) { return __ocml_asin_f64(__x); }
+__DEVICE__
+inline double asinh(double __x) { return __ocml_asinh_f64(__x); }
+__DEVICE__
+inline double atan(double __x) { return __ocml_atan_f64(__x); }
+__DEVICE__
+inline double atan2(double __x, double __y) {
+  return __ocml_atan2_f64(__x, __y);
+}
+__DEVICE__
+inline double atanh(double __x) { return __ocml_atanh_f64(__x); }
+__DEVICE__
+inline double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
+__DEVICE__
+inline double ceil(double __x) { return __ocml_ceil_f64(__x); }
+__DEVICE__
+inline double copysign(double __x, double __y) {
+  return __ocml_copysign_f64(__x, __y);
+}
+__DEVICE__
+inline double cos(double __x) { return __ocml_cos_f64(__x); }
+__DEVICE__
+inline double cosh(double __x) { return __ocml_cosh_f64(__x); }
+__DEVICE__
+inline double cospi(double __x) { return __ocml_cospi_f64(__x); }
+__DEVICE__
+inline double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
+__DEVICE__
+inline double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
+__DEVICE__
+inline double erf(double __x) { return __ocml_erf_f64(__x); }
+__DEVICE__
+inline double erfc(double __x) { return __ocml_erfc_f64(__x); }
+__DEVICE__
+inline double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
+__DEVICE__
+inline double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
+__DEVICE__
+inline double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
+__DEVICE__
+inline double exp(double __x) { return __ocml_exp_f64(__x); }
+__DEVICE__
+inline double exp10(double __x) { return __ocml_exp10_f64(__x); }
+__DEVICE__
+inline double exp2(double __x) { return __ocml_exp2_f64(__x); }
+__DEVICE__
+inline double expm1(double __x) { return __ocml_expm1_f64(__x); }
+__DEVICE__
+inline double fabs(double __x) { return __ocml_fabs_f64(__x); }
+__DEVICE__
+inline double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
+__DEVICE__
+inline double floor(double __x) { return __ocml_floor_f64(__x); }
+__DEVICE__
+inline double fma(double __x, double __y, double __z) {
+  return __ocml_fma_f64(__x, __y, __z);
+}
+__DEVICE__
+inline double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
+__DEVICE__
+inline double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
+__DEVICE__
+inline double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
+__DEVICE__
+inline double frexp(double __x, int *__nptr) {
+  int __tmp;
+  double __r =
+      __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
+  *__nptr = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline double hypot(double __x, double __y) {
+  return __ocml_hypot_f64(__x, __y);
+}
+__DEVICE__
+inline int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
+__DEVICE__
+inline __RETURN_TYPE isfinite(double __x) { return __ocml_isfinite_f64(__x); }
+__DEVICE__
+inline __RETURN_TYPE isinf(double __x) { return __ocml_isinf_f64(__x); }
+__DEVICE__
+inline __RETURN_TYPE isnan(double __x) { return __ocml_isnan_f64(__x); }
+__DEVICE__
+inline double j0(double __x) { return __ocml_j0_f64(__x); }
+__DEVICE__
+inline double j1(double __x) { return __ocml_j1_f64(__x); }
+__DEVICE__
+inline double jn(int __n,
+                 double __x) { // TODO: we could use Ahmes multiplication
+                               // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (__n == 0)
+    return j0f(__x);
+  if (__n == 1)
+    return j1f(__x);
+
+  double __x0 = j0f(__x);
+  double __x1 = j1f(__x);
+  for (int __i = 1; __i < __n; ++__i) {
+    double __x2 = (2 * __i) / __x * __x1 - __x0;
+    __x0 = __x1;
+    __x1 = __x2;
+  }
+
+  return __x1;
+}
+__DEVICE__
+inline double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
+__DEVICE__
+inline double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
+__DEVICE__
+inline long long int llrint(double __x) { return __ocml_rint_f64(__x); }
+__DEVICE__
+inline long long int llround(double __x) { return __ocml_round_f64(__x); }
+__DEVICE__
+inline double log(double __x) { return __ocml_log_f64(__x); }
+__DEVICE__
+inline double log10(double __x) { return __ocml_log10_f64(__x); }
+__DEVICE__
+inline double log1p(double __x) { return __ocml_log1p_f64(__x); }
+__DEVICE__
+inline double log2(double __x) { return __ocml_log2_f64(__x); }
+__DEVICE__
+inline double logb(double __x) { return __ocml_logb_f64(__x); }
+__DEVICE__
+inline long int lrint(double __x) { return __ocml_rint_f64(__x); }
+__DEVICE__
+inline long int lround(double __x) { return __ocml_round_f64(__x); }
+__DEVICE__
+inline double modf(double __x, double *__iptr) {
+  double __tmp;
+  double __r =
+      __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
+  *__iptr = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline double nan(const char *__tagp) {
+#if !_WIN32
+  union {
+    double val;
+    struct ieee_double {
+      uint64_t mantissa : 51;
+      uint32_t quiet : 1;
+      uint32_t exponent : 11;
+      uint32_t sign : 1;
+    } bits;
+    static_assert(sizeof(double) == sizeof(ieee_double), "");
+  } __tmp;
+
+  __tmp.bits.sign = 0u;
+  __tmp.bits.exponent = ~0u;
+  __tmp.bits.quiet = 1u;
+  __tmp.bits.mantissa = __make_mantissa(__tagp);
+
+  return __tmp.val;
+#else
+  static_assert(sizeof(uint64_t) == sizeof(double));
+  uint64_t val = __make_mantissa(__tagp);
+  val |= 0xFFF << 51;
+  return *reinterpret_cast<double *>(&val);
+#endif
+}
+__DEVICE__
+inline double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
+__DEVICE__
+inline double nextafter(double __x, double __y) {
+  return __ocml_nextafter_f64(__x, __y);
+}
+__DEVICE__
+inline double
+norm(int __dim,
+     const double *__a) { // TODO: placeholder until OCML adds support.
+  double __r = 0;
+  while (__dim--) {
+    __r += __a[0] * __a[0];
+    ++__a;
+  }
+
+  return __ocml_sqrt_f64(__r);
+}
+__DEVICE__
+inline double norm3d(double __x, double __y, double __z) {
+  return __ocml_len3_f64(__x, __y, __z);
+}
+__DEVICE__
+inline double norm4d(double __x, double __y, double __z, double __w) {
+  return __ocml_len4_f64(__x, __y, __z, __w);
+}
+__DEVICE__
+inline double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
+__DEVICE__
+inline double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
+__DEVICE__
+inline double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
+__DEVICE__
+inline double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
+__DEVICE__
+inline double remainder(double __x, double __y) {
+  return __ocml_remainder_f64(__x, __y);
+}
+__DEVICE__
+inline double remquo(double __x, double __y, int *__quo) {
+  int __tmp;
+  double __r = __ocml_remquo_f64(
+      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
+  *__quo = __tmp;
+
+  return __r;
+}
+__DEVICE__
+inline double rhypot(double __x, double __y) {
+  return __ocml_rhypot_f64(__x, __y);
+}
+__DEVICE__
+inline double rint(double __x) { return __ocml_rint_f64(__x); }
+__DEVICE__
+inline double
+rnorm(int __dim,
+      const double *__a) { // TODO: placeholder until OCML adds support.
+  double __r = 0;
+  while (__dim--) {
+    __r += __a[0] * __a[0];
+    ++__a;
+  }
+
+  return __ocml_rsqrt_f64(__r);
+}
+__DEVICE__
+inline double rnorm3d(double __x, double __y, double __z) {
+  return __ocml_rlen3_f64(__x, __y, __z);
+}
+__DEVICE__
+inline double rnorm4d(double __x, double __y, double __z, double __w) {
+  return __ocml_rlen4_f64(__x, __y, __z, __w);
+}
+__DEVICE__
+inline double round(double __x) { return __ocml_round_f64(__x); }
+__DEVICE__
+inline double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
+__DEVICE__
+inline double scalbln(double __x, long int __n) {
+  return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n)
+                         : __ocml_scalb_f64(__x, __n);
+}
+__DEVICE__
+inline double scalbn(double __x, int __n) {
+  return __ocml_scalbn_f64(__x, __n);
+}
+__DEVICE__
+inline __RETURN_TYPE signbit(double __x) { return __ocml_signbit_f64(__x); }
+__DEVICE__
+inline double sin(double __x) { return __ocml_sin_f64(__x); }
+__DEVICE__
+inline void sincos(double __x, double *__sptr, double *__cptr) {
+  double __tmp;
+  *__sptr = __ocml_sincos_f64(
+      __x, (__attribute__((address_space(5))) double *)&__tmp);
+  *__cptr = __tmp;
+}
+__DEVICE__
+inline void sincospi(double __x, double *__sptr, double *__cptr) {
+  double __tmp;
+  *__sptr = __ocml_sincospi_f64(
+      __x, (__attribute__((address_space(5))) double *)&__tmp);
+  *__cptr = __tmp;
+}
+__DEVICE__
+inline double sinh(double __x) { return __ocml_sinh_f64(__x); }
+__DEVICE__
+inline double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
+__DEVICE__
+inline double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
+__DEVICE__
+inline double tan(double __x) { return __ocml_tan_f64(__x); }
+__DEVICE__
+inline double tanh(double __x) { return __ocml_tanh_f64(__x); }
+__DEVICE__
+inline double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
+__DEVICE__
+inline double trunc(double __x) { return __ocml_trunc_f64(__x); }
+__DEVICE__
+inline double y0(double __x) { return __ocml_y0_f64(__x); }
+__DEVICE__
+inline double y1(double __x) { return __ocml_y1_f64(__x); }
+__DEVICE__
+inline double yn(int __n,
+                 double __x) { // TODO: we could use Ahmes multiplication
+                               // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (__n == 0)
+    return j0f(__x);
+  if (__n == 1)
+    return j1f(__x);
+
+  double __x0 = j0f(__x);
+  double __x1 = j1f(__x);
+  for (int __i = 1; __i < __n; ++__i) {
+    double __x2 = (2 * __i) / __x * __x1 - __x0;
+    __x0 = __x1;
+    __x1 = __x2;
+  }
+
+  return __x1;
+}
+
+// BEGIN INTRINSICS
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dadd_rd(double __x, double __y) {
+  return __ocml_add_rtn_f64(__x, __y);
+}
+#endif
+__DEVICE__
+inline double __dadd_rn(double __x, double __y) { return __x + __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dadd_ru(double __x, double __y) {
+  return __ocml_add_rtp_f64(__x, __y);
+}
+__DEVICE__
+inline double __dadd_rz(double __x, double __y) {
+  return __ocml_add_rtz_f64(__x, __y);
+}
+__DEVICE__
+inline double __ddiv_rd(double __x, double __y) {
+  return __ocml_div_rtn_f64(__x, __y);
+}
+#endif
+__DEVICE__
+inline double __ddiv_rn(double __x, double __y) { return __x / __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __ddiv_ru(double __x, double __y) {
+  return __ocml_div_rtp_f64(__x, __y);
+}
+__DEVICE__
+inline double __ddiv_rz(double __x, double __y) {
+  return __ocml_div_rtz_f64(__x, __y);
+}
+__DEVICE__
+inline double __dmul_rd(double __x, double __y) {
+  return __ocml_mul_rtn_f64(__x, __y);
+}
+#endif
+__DEVICE__
+inline double __dmul_rn(double __x, double __y) { return __x * __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dmul_ru(double __x, double __y) {
+  return __ocml_mul_rtp_f64(__x, __y);
+}
+__DEVICE__
+inline double __dmul_rz(double __x, double __y) {
+  return __ocml_mul_rtz_f64(__x, __y);
+}
+__DEVICE__
+inline double __drcp_rd(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
+#endif
+__DEVICE__
+inline double __drcp_rn(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __drcp_ru(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
+__DEVICE__
+inline double __drcp_rz(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
+__DEVICE__
+inline double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }
+#endif
+__DEVICE__
+inline double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }
+__DEVICE__
+inline double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
+__DEVICE__
+inline double __dsub_rd(double __x, double __y) {
+  return __ocml_sub_rtn_f64(__x, __y);
+}
+#endif
+__DEVICE__
+inline double __dsub_rn(double __x, double __y) { return __x - __y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dsub_ru(double __x, double __y) {
+  return __ocml_sub_rtp_f64(__x, __y);
+}
+__DEVICE__
+inline double __dsub_rz(double __x, double __y) {
+  return __ocml_sub_rtz_f64(__x, __y);
+}
+__DEVICE__
+inline double __fma_rd(double __x, double __y, double __z) {
+  return __ocml_fma_rtn_f64(__x, __y, __z);
+}
+#endif
+__DEVICE__
+inline double __fma_rn(double __x, double __y, double __z) {
+  return __ocml_fma_f64(__x, __y, __z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __fma_ru(double __x, double __y, double __z) {
+  return __ocml_fma_rtp_f64(__x, __y, __z);
+}
+__DEVICE__
+inline double __fma_rz(double __x, double __y, double __z) {
+  return __ocml_fma_rtz_f64(__x, __y, __z);
+}
+#endif
+// END INTRINSICS
+// END DOUBLE
+
+// BEGIN INTEGER
+__DEVICE__
+inline int abs(int __x) {
+  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
+  return (__x ^ __sgn) - __sgn;
+}
+__DEVICE__
+inline long labs(long __x) {
+  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
+  return (__x ^ __sgn) - __sgn;
+}
+__DEVICE__
+inline long long llabs(long long __x) {
+  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
+  return (__x ^ __sgn) - __sgn;
+}
+
+#if defined(__cplusplus)
+__DEVICE__
+inline long abs(long __x) { return labs(__x); }
+__DEVICE__
+inline long long abs(long long __x) { return llabs(__x); }
+#endif
+// END INTEGER
+
+__DEVICE__
+inline _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
+  return __ocml_fma_f16(__x, __y, __z);
+}
+
+__DEVICE__
+inline float fma(float __x, float __y, float __z) {
+  return fmaf(__x, __y, __z);
+}
+
+#pragma push_macro("__DEF_FUN1")
+#pragma push_macro("__DEF_FUN2")
+#pragma push_macro("__DEF_FUNI")
+#pragma push_macro("__DEF_FLOAT_FUN2I")
+#pragma push_macro("__HIP_OVERLOAD1")
+#pragma push_macro("__HIP_OVERLOAD2")
+
+// __hip_enable_if::type is a type function which returns __T if __B is true.
+template <bool __B, class __T = void> struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
+
+// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
+// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with
+// floor(double).
+#define __HIP_OVERLOAD1(__retty, __fn)                                         \
+  template <typename __T>                                                      \
+  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+  __fn(__T __x) {                                                              \
+    return ::__fn((double)__x);                                                \
+  }
+
+// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
+// or integer argument to avoid compilation error due to ambibuity. e.g.
+// max(5.0f, 6.0) is resolved with max(double, double).
+#define __HIP_OVERLOAD2(__retty, __fn)                                         \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__                                                                   \
+      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
+                                   std::numeric_limits<__T2>::is_specialized,  \
+                               __retty>::type                                  \
+      __fn(__T1 __x, __T2 __y) {                                               \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+// Define cmath functions with float argument and returns float.
+#define __DEF_FUN1(__retty, __func)                                            \
+  __DEVICE__                                                                   \
+  inline float __func(float __x) { return __func##f(__x); }                    \
+  __HIP_OVERLOAD1(__retty, __func)
+
+// Define cmath functions with float argument and returns __retty.
+#define __DEF_FUNI(__retty, __func)                                            \
+  __DEVICE__                                                                   \
+  inline __retty __func(float __x) { return __func##f(__x); }                  \
+  __HIP_OVERLOAD1(__retty, __func)
+
+// define cmath functions with two float arguments.
+#define __DEF_FUN2(__retty, __func)                                            \
+  __DEVICE__                                                                   \
+  inline float __func(float __x, float __y) { return __func##f(__x, __y); }    \
+  __HIP_OVERLOAD2(__retty, __func)
+
+__DEF_FUN1(double, acos)
+__DEF_FUN1(double, acosh)
+__DEF_FUN1(double, asin)
+__DEF_FUN1(double, asinh)
+__DEF_FUN1(double, atan)
+__DEF_FUN2(double, atan2);
+__DEF_FUN1(double, atanh)
+__DEF_FUN1(double, cbrt)
+__DEF_FUN1(double, ceil)
+__DEF_FUN2(double, copysign);
+__DEF_FUN1(double, cos)
+__DEF_FUN1(double, cosh)
+__DEF_FUN1(double, erf)
+__DEF_FUN1(double, erfc)
+__DEF_FUN1(double, exp)
+__DEF_FUN1(double, exp2)
+__DEF_FUN1(double, expm1)
+__DEF_FUN1(double, fabs)
+__DEF_FUN2(double, fdim);
+__DEF_FUN1(double, floor)
+__DEF_FUN2(double, fmax);
+__DEF_FUN2(double, fmin);
+__DEF_FUN2(double, fmod);
+//__HIP_OVERLOAD1(int, fpclassify)
+__DEF_FUN2(double, hypot);
+__DEF_FUNI(int, ilogb)
+__HIP_OVERLOAD1(bool, isfinite)
+__HIP_OVERLOAD2(bool, isgreater);
+__HIP_OVERLOAD2(bool, isgreaterequal);
+__HIP_OVERLOAD1(bool, isinf);
+__HIP_OVERLOAD2(bool, isless);
+__HIP_OVERLOAD2(bool, islessequal);
+__HIP_OVERLOAD2(bool, islessgreater);
+__HIP_OVERLOAD1(bool, isnan);
+//__HIP_OVERLOAD1(bool, isnormal)
+__HIP_OVERLOAD2(bool, isunordered);
+__DEF_FUN1(double, lgamma)
+__DEF_FUN1(double, log)
+__DEF_FUN1(double, log10)
+__DEF_FUN1(double, log1p)
+__DEF_FUN1(double, log2)
+__DEF_FUN1(double, logb)
+__DEF_FUNI(long long, llrint)
+__DEF_FUNI(long long, llround)
+__DEF_FUNI(long, lrint)
+__DEF_FUNI(long, lround)
+__DEF_FUN1(double, nearbyint);
+__DEF_FUN2(double, nextafter);
+__DEF_FUN2(double, pow);
+__DEF_FUN2(double, remainder);
+__DEF_FUN1(double, rint);
+__DEF_FUN1(double, round);
+__HIP_OVERLOAD1(bool, signbit)
+__DEF_FUN1(double, sin)
+__DEF_FUN1(double, sinh)
+__DEF_FUN1(double, sqrt)
+__DEF_FUN1(double, tan)
+__DEF_FUN1(double, tanh)
+__DEF_FUN1(double, tgamma)
+__DEF_FUN1(double, trunc);
+
+// define cmath functions with a float and an integer argument.
+#define __DEF_FLOAT_FUN2I(__func)                                              \
+  __DEVICE__                                                                   \
+  inline float __func(float __x, int __y) { return __func##f(__x, __y); }
+__DEF_FLOAT_FUN2I(scalbn)
+
+template <class T> __DEVICE__ inline T min(T __arg1, T __arg2) {
+  return (__arg1 < __arg2) ? __arg1 : __arg2;
+}
+
+template <class T> __DEVICE__ inline T max(T __arg1, T __arg2) {
+  return (__arg1 > __arg2) ? __arg1 : __arg2;
+}
+
+__DEVICE__ inline int min(int __arg1, int __arg2) {
+  return (__arg1 < __arg2) ? __arg1 : __arg2;
+}
+__DEVICE__ inline int max(int __arg1, int __arg2) {
+  return (__arg1 > __arg2) ? __arg1 : __arg2;
+}
+
+__DEVICE__
+inline float max(float __x, float __y) { return fmaxf(__x, __y); }
+
+__DEVICE__
+inline double max(double __x, double __y) { return fmax(__x, __y); }
+
+__DEVICE__
+inline float min(float __x, float __y) { return fminf(__x, __y); }
+
+__DEVICE__
+inline double min(double __x, double __y) { return fmin(__x, __y); }
+
+__HIP_OVERLOAD2(double, max)
+__HIP_OVERLOAD2(double, min)
+
+__host__ inline static int min(int __arg1, int __arg2) {
+  return std::min(__arg1, __arg2);
+}
+
+__host__ inline static int max(int __arg1, int __arg2) {
+  return std::max(__arg1, __arg2);
+}
+
+#pragma pop_macro("__DEF_FUN1")
+#pragma pop_macro("__DEF_FUN2")
+#pragma pop_macro("__DEF_FUNI")
+#pragma pop_macro("__DEF_FLOAT_FUN2I")
+#pragma pop_macro("__HIP_OVERLOAD1")
+#pragma pop_macro("__HIP_OVERLOAD2")
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__RETURN_TYPE")
+
+#endif // __CLANG_HIP_MATH_H__

diff  --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
new file mode 100644
index 000000000000..8c86649fc960
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -0,0 +1,64 @@
+/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ */
+
+#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
+#define __CLANG_HIP_RUNTIME_WRAPPER_H__
+
+#if __HIP__
+
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+#if __HIP_ENABLE_DEVICE_MALLOC__
+extern "C" __device__ void *__hip_malloc(size_t __size);
+extern "C" __device__ void *__hip_free(void *__ptr);
+static inline __device__ void *malloc(size_t __size) {
+  return __hip_malloc(__size);
+}
+static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
+#else
+static inline __device__ void *malloc(size_t __size) {
+  __builtin_trap();
+  return nullptr;
+}
+static inline __device__ void *free(void *__ptr) {
+  __builtin_trap();
+  return nullptr;
+}
+#endif
+
+#include <__clang_hip_libdevice_declares.h>
+#include <__clang_hip_math.h>
+
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+
+#include <algorithm>
+#include <complex>
+#include <new>
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+
+#define __CLANG_HIP_RUNTIME_WRPPER_INCLUDED__ 1
+
+#endif // __HIP__
+#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc

diff  --git a/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc
rename to clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc

diff  --git a/clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h b/clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h
new file mode 100644
index 000000000000..e69de29bb2d1

diff  --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index eaa3f83ae247..3dd798476e2b 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -8,7 +8,7 @@
 // Test subtarget with flushing on by default.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx803 \
-// RUN:  --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:  --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -16,7 +16,7 @@
 // Test subtarget with flushing off by ddefault.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx900 \
-// RUN:  --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:  --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -25,7 +25,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -34,7 +34,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -43,7 +43,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -52,7 +52,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -61,7 +61,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -69,7 +69,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -77,7 +77,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -85,7 +85,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -93,13 +93,13 @@
 // Test --hip-device-lib-path flag
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode   \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode \
 // RUN:   %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \

diff  --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip
new file mode 100644
index 000000000000..9fa3125a5070
--- /dev/null
+++ b/clang/test/Driver/hip-include-path.hip
@@ -0,0 +1,31 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,HIP %s
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nobuiltininc -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,NOWRAP,HIP %s
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpuinc -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,NOHIP %s
+
+// COMMON-LABEL: clang{{.*}} -cc1 -triple amdgcn-amd-amdhsa
+// WRAP: clang/{{.*}}/include/cuda_wrappers
+// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
+// HIP: {{.*}}Inputs/rocm/include
+// NOHIP-NOT: {{.*}}Inputs/rocm/include
+// COMMON: {{.*}}include/c++
+// COMMON: clang/{{.*}}/include
+
+// COMMON-LABEL: clang{{.*}} -cc1 -triple x86_64-unknown-linux-gnu
+// WRAP: clang/{{.*}}/include/cuda_wrappers
+// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
+// HIP: {{.*}}Inputs/rocm/include
+// NOHIP-NOT: {{.*}}Inputs/rocm/include
+// COMMON: {{.*}}include/c++
+// COMMON: clang/{{.*}}/include

diff  --git a/clang/test/Driver/rocm-detect.cl b/clang/test/Driver/rocm-detect.cl
index b143098c9074..75378bf003be 100644
--- a/clang/test/Driver/rocm-detect.cl
+++ b/clang/test/Driver/rocm-detect.cl
@@ -7,12 +7,12 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 

diff  --git a/clang/test/Driver/rocm-detect.hip b/clang/test/Driver/rocm-detect.hip
index 82ed7138098a..9490ec9ba376 100644
--- a/clang/test/Driver/rocm-detect.hip
+++ b/clang/test/Driver/rocm-detect.hip
@@ -8,17 +8,17 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 // Should not interpret -nostdlib as disabling offload libraries.
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nostdlib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 

diff  --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
index 23cabd654391..cdb4716bde9a 100644
--- a/clang/test/Driver/rocm-device-libs.cl
+++ b/clang/test/Driver/rocm-device-libs.cl
@@ -6,7 +6,7 @@
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
@@ -15,7 +15,7 @@
 // Make sure the 
diff erent denormal default is respected for gfx8
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -24,7 +24,7 @@
 // Make sure the non-canonical name works
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=fiji \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -33,7 +33,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX900,WAVE64 %s
 
@@ -41,7 +41,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX803,WAVE64 %s
 
@@ -50,7 +50,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-finite-math-only \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FINITE-ONLY,GFX803,WAVE64 %s
 
@@ -59,7 +59,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fp32-correctly-rounded-divide-sqrt \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-CORRECT-SQRT,GFX803,WAVE64 %s
 
@@ -68,7 +68,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fast-relaxed-math \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FAST-RELAXED,GFX803,WAVE64 %s
 
@@ -77,45 +77,45 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-unsafe-math-optimizations \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-UNSAFE,GFX803,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1011                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1011,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1012                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1012,WAVE32 %s
 
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64 -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // Ignore -mno-wavefrontsize64 without wave32 support
 // RUN: %clang -### -target amdgcn-amd-amdhsa       \
 // RUN:   -x cl -mcpu=gfx803  -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs    \
+// RUN:   --rocm-path=%S/Inputs/rocm    \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX803,WAVE64 %s
 
@@ -124,12 +124,12 @@
 // Test --hip-device-lib-path format
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s

diff  --git a/clang/test/Driver/rocm-not-found.cl b/clang/test/Driver/rocm-not-found.cl
index 8ecc4b0ef105..ee931971d9e6 100644
--- a/clang/test/Driver/rocm-not-found.cl
+++ b/clang/test/Driver/rocm-not-found.cl
@@ -5,7 +5,7 @@
 
 // RUN: %clang -### --sysroot=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
 // RUN: %clang -### --rocm-path=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
-// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib.
+// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib and -nogpuinc to build without ROCm device library and HIP includes.
 
 // Accept nogpulib or nostdlib for OpenCL.
 // RUN: %clang -### -nogpulib --rocm-path=%s/no-rocm-there %s 2>&1 | FileCheck %s --check-prefix OK


        


More information about the cfe-commits mailing list