[clang] 11cc826 - [Clang] Implement resource directory headers for common GPU intrinsics (#110179)

Mon Nov 11 10:09:58 PST 2024

Author: Joseph Huber
Date: 2024-11-11T10:09:55-08:00
New Revision: 11cc826c0a5802b03c85aa271b6fd16214f8f4d8

URL: https://github.com/llvm/llvm-project/commit/11cc826c0a5802b03c85aa271b6fd16214f8f4d8
DIFF: https://github.com/llvm/llvm-project/commit/11cc826c0a5802b03c85aa271b6fd16214f8f4d8.diff

LOG: [Clang] Implement resource directory headers for common GPU intrinsics (#110179)

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.

Added: 
    clang/lib/Headers/amdgpuintrin.h
    clang/lib/Headers/gpuintrin.h
    clang/lib/Headers/nvptxintrin.h
    clang/test/Headers/gpuintrin.c
    clang/test/Headers/gpuintrin_lang.c

Modified: 
    clang/lib/Headers/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 225bf131aeab41..a094305bcec5e4 100644

--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -276,6 +276,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -304,6 +310,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -526,6 +533,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -712,6 +720,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()

diff  --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00000000000000..f4f90b394522d6
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,190 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+}
+
+// Shuffles the the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+}
+
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __builtin_amdgcn_is_shared(
+      (void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __builtin_amdgcn_is_private(
+      (void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_amdgcn_endpgm();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  __builtin_amdgcn_s_sleep(2);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#endif // __AMDGPUINTRIN_H

diff  --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
new file mode 100644
index 00000000000000..4c463c333308fc
--- /dev/null
+++ b/clang/lib/Headers/gpuintrin.h
@@ -0,0 +1,196 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, but
+// some targets may provide 
diff erent implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#if defined(__NVPTX__)
+#include <nvptxintrin.h>
+#elif defined(__AMDGPU__)
+#include <amdgpuintrin.h>
+#elif !defined(_OPENMP)
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+#define __GPU_X_DIM 0
+#define __GPU_Y_DIM 1
+#define __GPU_Z_DIM 2
+
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_blocks_x();
+  case 1:
+    return __gpu_num_blocks_y();
+  case 2:
+    return __gpu_num_blocks_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the number of block id in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_block_id_x();
+  case 1:
+    return __gpu_block_id_y();
+  case 2:
+    return __gpu_block_id_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_threads_x();
+  case 1:
+    return __gpu_num_threads_y();
+  case 2:
+    return __gpu_num_threads_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_thread_id_x();
+  case 1:
+    return __gpu_thread_id_y();
+  case 2:
+    return __gpu_thread_id_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_FN_ATTRS static __inline__ bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_read_first_lane_u32(__lane_mask,
+                                       __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_read_first_lane_u64(__lane_mask,
+                                        __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Shuffles the the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
+      uint32_t __index = __step + __gpu_lane_id();                             \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
+    }                                                                          \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
+  _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
+      uint64_t __lane_mask, uint32_t __x) {                                    \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __index = __gpu_lane_id() - __step;                             \
+      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
+      __x += __builtin_bit_cast(                                               \
+          __type,                                                              \
+          -bitmask & __builtin_bit_cast(__bitmask_type,                        \
+                                        __gpu_shuffle_idx_##__suffix(          \
+                                            __lane_mask, __index, __x)));      \
+    }                                                                          \
+    return __x;                                                                \
+  }
+__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
+__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
+__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
+__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
+#undef __DO_LANE_SCAN
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#undef _DEFAULT_FN_ATTRS
+
+#endif // __GPUINTRIN_H

diff  --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
new file mode 100644
index 00000000000000..8b68b807cac4f3
--- /dev/null
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -0,0 +1,201 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __nvvm_read_ptx_sreg_warpsize();
+}
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __id = __builtin_ffs(__mask) - 1;
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __id = __builtin_ffs(__mask) - 1;
+  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
+                                             __gpu_num_lanes() - 1)
+          << 32ull) |
+         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
+                                             __gpu_num_lanes() - 1));
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  return __nvvm_vote_ballot_sync(__mask, __x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __syncthreads();
+}
+
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __nvvm_bar_warp_sync((uint32_t)__lane_mask);
+}
+
+// Shuffles the the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+}
+
+// Shuffles the the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint64_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask & ((uint64_t)__nvvm_shfl_sync_idx_i32(
+                           __mask, __hi, __idx, __gpu_num_lanes() - 1u)
+                       << 32ull) |
+         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
+                                             __gpu_num_lanes() - 1u));
+}
+
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __nvvm_isspacep_shared(ptr);
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __nvvm_isspacep_local(ptr);
+}
+
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __nvvm_exit();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+    asm("nanosleep.u32 64;" ::: "memory");
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#endif // __NVPTXINTRIN_H

diff  --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
new file mode 100644
index 00000000000000..2e45f73692f534
--- /dev/null
+++ b/clang/test/Headers/gpuintrin.c
@@ -0,0 +1,107 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=AMDGPU
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -target-feature +ptx62 \
+// RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=NVPTX
+
+#include <gpuintrin.h>
+
+// AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
+// AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7:[0-9]+]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
+// AMDGPU-NEXT:    unreachable
+//
+// NVPTX-LABEL: define protected void @foo(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
+// NVPTX-NEXT:    unreachable
+//
+__gpu_kernel void foo() {
+  __gpu_num_blocks_x();
+  __gpu_num_blocks_y();
+  __gpu_num_blocks_z();
+  __gpu_num_blocks(0);
+  __gpu_block_id_x();
+  __gpu_block_id_y();
+  __gpu_block_id_z();
+  __gpu_block_id(0);
+  __gpu_num_threads_x();
+  __gpu_num_threads_y();
+  __gpu_num_threads_z();
+  __gpu_num_threads(0);
+  __gpu_thread_id_x();
+  __gpu_thread_id_y();
+  __gpu_thread_id_z();
+  __gpu_thread_id(0);
+  __gpu_num_lanes();
+  __gpu_lane_id();
+  __gpu_lane_mask();
+  __gpu_read_first_lane_u32(-1, -1);
+  __gpu_ballot(-1, 1);
+  __gpu_sync_threads();
+  __gpu_sync_lane(-1);
+  __gpu_shuffle_idx_u32(-1, -1, -1);
+  __gpu_first_lane_id(-1);
+  __gpu_is_first_in_lane(-1);
+  __gpu_exit();
+}

diff  --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
new file mode 100644
index 00000000000000..b2dfc9d40827df
--- /dev/null
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple nvptx64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=CUDA
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=HIP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -cl-std=CL3.0 -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=OPENCL
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -cl-std=CL3.0 \
+// RUN:   -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
+// RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=OPENMP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=C89
+
+#define _DEFAULT_FN_ATTRS __attribute__((always_inline))
+#include <gpuintrin.h>
+
+#ifdef __device__
+__device__ int foo() { return __gpu_thread_id_x(); }
+#else
+// CUDA-LABEL: define dso_local i32 @foo(
+// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
+// CUDA-NEXT:  [[ENTRY:.*:]]
+// CUDA-NEXT:    [[TMP0:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    ret i32 [[TMP0]]
+//
+// HIP-LABEL: define dso_local i32 @foo(
+// HIP-SAME: ) #[[ATTR0:[0-9]+]] {
+// HIP-NEXT:  [[ENTRY:.*:]]
+// HIP-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// HIP-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// HIP-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// HIP-NEXT:    ret i32 [[TMP0]]
+//
+// OPENCL-LABEL: define dso_local i32 @foo(
+// OPENCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENCL-NEXT:    ret i32 [[TMP0]]
+//
+// OPENMP-LABEL: define hidden i32 @foo(
+// OPENMP-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENMP-NEXT:  [[ENTRY:.*:]]
+// OPENMP-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENMP-NEXT:    ret i32 [[TMP0]]
+//
+// C89-LABEL: define dso_local i32 @foo(
+// C89-SAME: ) #[[ATTR2:[0-9]+]] {
+// C89-NEXT:  [[ENTRY:.*:]]
+// C89-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// C89-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// C89-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// C89-NEXT:    ret i32 [[TMP0]]
+//
+int foo() { return __gpu_thread_id_x(); }
+#pragma omp declare target to(foo)
+#endif