[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 11 06:05:33 PST 2024


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/110179

>From 2a8b7a70e29f545543bfe3eaa8b66e6fe047a4a4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 01/18] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU-based languages provide some way to access resources such as the
thread ID, but this support is spread across many different languages and
varies between targets. The goal here is to provide a resource directory
header that exposes these in a single, easier-to-understand way, primarily
so they can be used from C/C++ code. The interface aims to be common, to
facilitate porting, but target-specific functionality can still live in
the individual headers.
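
As a rough sketch of the intended use (illustrative only, not part of this
patch; the helper names and the _kernel macro are the ones declared below,
the kernel itself is hypothetical), a C translation unit compiled directly
for the GPU target might look like:

  #include <stdint.h>
  #include <gpuintrin.h>

  // Hypothetical kernel: each thread copies one element of the array.
  _kernel void copy(uint32_t *dst, const uint32_t *src, uint64_t n) {
    uint64_t tid = _get_thread_id_x() +
                   (uint64_t)_get_num_threads_x() * _get_block_id_x();
    if (tid < n)
      dst[tid] = src[tid];
  }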
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++++++++++++++++++++++++++++++
 clang/lib/Headers/gpuintrin.h    |  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++++++++++++++++++++++++++++++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 67242cd4d981bc..813fc86a5bb5cb 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -273,6 +273,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -301,6 +307,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -523,6 +530,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -709,6 +717,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00000000000000..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgroups in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t, uint32_t x) {
+  return __builtin_amdgcn_readfirstlane(x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+_ballot(uint64_t lane_mask, bool x) {
+  // ANDing with lane_mask gives the NVPTX semantics when lane_mask is a
+  // subset of the active threads.
+  return lane_mask & __builtin_amdgcn_ballot_w64(x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_shuffle(uint64_t, uint32_t idx, uint32_t x) {
+  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
+}
+
+// Returns the current value of the GPU's processor clock.
+// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
+  return __builtin_readcyclecounter();
+}
+
+// Returns a fixed-frequency timestamp. The actual frequency is dependent on
+// the card and can only be queried via the driver.
+_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() {
+  __builtin_amdgcn_endpgm();
+}
+
+#pragma omp end declare variant
+#pragma omp end declare target
+#undef _DEFAULT_ATTRS
+
+#endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
new file mode 100644
index 00000000000000..09ca48b5580327
--- /dev/null
+++ b/clang/lib/Headers/gpuintrin.h
@@ -0,0 +1,18 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include <nvptxintrin.h>
+#elif defined(__AMDGPU__)
+#include <amdgpuintrin.h>
+#endif
+
+#endif // __GPUINTRIN_H
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
new file mode 100644
index 00000000000000..a1b124ecd958ab
--- /dev/null
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t lane_mask, uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t id = __builtin_ffs(mask) - 1;
+  return __nvvm_shfl_sync_idx_i32(mask, x, id, _get_lane_size() - 1);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+_ballot(uint64_t lane_mask, bool x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  return __nvvm_vote_ballot_sync(mask, x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+  __syncthreads();
+}
+
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void
+_sync_lane(uint64_t mask) {
+  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
+}
+
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t bitmask = (mask >> idx) & 1;
+  return -bitmask &
+         __nvvm_shfl_sync_idx_i32(mask, x, idx, _get_lane_size() - 1);
+}
+
+// Returns the current value of the GPU's processor clock.
+_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
+  return __builtin_readcyclecounter();
+}
+
+// Returns a global fixed-frequency timer at nanosecond frequency.
+_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
+}
+
+// Terminates execution of the calling thread.
+_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() { __nvvm_exit(); }
+
+#pragma omp end declare variant
+#pragma omp end declare target
+#undef _DEFAULT_ATTRS
+
+#endif // __NVPTXINTRIN_H

>From 12443daa2f634c94497f05309cdb018e45bdd9a6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 3 Oct 2024 15:46:27 -0500
Subject: [PATCH 02/18] Update and address comments

---
 clang/lib/Headers/amdgpuintrin.h | 102 +++----
 clang/lib/Headers/gpuintrin.h    |  58 ++++
 clang/lib/Headers/nvptxintrin.h  | 115 +++----
 clang/test/Headers/gpuintrin.c   | 508 +++++++++++++++++++++++++++++++
 4 files changed, 642 insertions(+), 141 deletions(-)
 create mode 100644 clang/test/Headers/gpuintrin.c

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 95936f86bd15b8..1fd7261cf4ca75 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -10,178 +10,144 @@
 #define __AMDGPUINTRIN_H
 
 #ifndef __AMDGPU__
-#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
 #endif
 
 #include <stdbool.h>
 #include <stdint.h>
 
 #if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
-#else
-#define _DEFAULT_ATTRS __attribute__((always_inline))
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
 #endif
 
 #pragma omp begin declare target device_type(nohost)
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 // Type aliases to the address spaces used by the AMDGPU backend.
-#define _private __attribute__((opencl_private))
-#define _constant __attribute__((opencl_constant))
-#define _local __attribute__((opencl_local))
-#define _global __attribute__((opencl_global))
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
 
 // Attribute to declare a function as a kernel.
-#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
   return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
   return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 }
 
-// Returns the total number of workgroups in the grid.
-_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
-  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
-}
-
 // Returns the 'x' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
 // Returns the 'y' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
   return __builtin_amdgcn_workgroup_id_y();
 }
 
 // Returns the 'z' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
   return __builtin_amdgcn_workgroup_id_z();
 }
 
-// Returns the absolute id of the AMD workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
-  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
-         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
-}
-
 // Returns the number of workitems in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
   return __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workitems in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
   return __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workitems in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
   return __builtin_amdgcn_workgroup_size_z();
 }
 
-// Returns the total number of workitems in the workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
-  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
-}
-
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
   return __builtin_amdgcn_workitem_id_x();
 }
 
 // Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
   return __builtin_amdgcn_workitem_id_y();
 }
 
 // Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
   return __builtin_amdgcn_workitem_id_z();
 }
 
-// Returns the absolute id of the thread in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
-  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
-         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
-}
-
 // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
 // and compilation options.
-_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
   return __builtin_amdgcn_wavefrontsize();
 }
 
 // Returns the id of the thread inside of an AMD wavefront executing together.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
 // Returns the bit-mask of active threads in the current wavefront.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
   return __builtin_amdgcn_read_exec();
 }
 
 // Copies the value from the first active thread in the wavefront to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_broadcast_value(uint64_t, uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
-_ballot(uint64_t lane_mask, bool x) {
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
   // ANDing with lane_mask gives the NVPTX semantics when lane_mask is a
   // subset of the active threads.
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
   __builtin_amdgcn_s_barrier();
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
 // Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) {
+_DEFAULT_ATTRS [[clang::convergent]] static inline void
+__gpu_sync_lane(uint64_t __lane_mask) {
   __builtin_amdgcn_wave_barrier();
 }
 
 // Shuffles the lanes inside the wavefront according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_shuffle(uint64_t, uint32_t idx, uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-// Returns the current value of the GPU's processor clock.
-// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
-  return __builtin_readcyclecounter();
-}
-
-// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-// the card and can only be queried via the driver.
-_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
+__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
 // Terminates execution of the associated wavefront.
-_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() {
+_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() {
   __builtin_amdgcn_endpgm();
 }
 
 #pragma omp end declare variant
 #pragma omp end declare target
-#undef _DEFAULT_ATTRS
 
 #endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 09ca48b5580327..2531ad8bba70c8 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -15,4 +15,62 @@
 #include <amdgpuintrin.h>
 #endif
 
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+         (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+         (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+             __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
+  return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
+}
+
+// Returns the absolute id of the thread in the current block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
+  return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
+         __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_reduce(uint64_t __lane_mask,
+                                                        uint32_t x) {
+  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
+    uint32_t index = step + __gpu_lane_id();
+    x += __gpu_shuffle_idx(__lane_mask, index, x);
+  }
+  return __gpu_broadcast(__lane_mask, x);
+}
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan(uint64_t __lane_mask,
+                                                      uint32_t x) {
+  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
+    uint32_t index = __gpu_lane_id() - step;
+    uint32_t bitmask = __gpu_lane_id() >= step;
+    x += -bitmask & __gpu_shuffle_idx(__lane_mask, index, x);
+  }
+  return x;
+}
+
+#undef _DEFAULT_ATTRS
+
 #endif // __GPUINTRIN_H
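
For illustration only (not part of the diff; the __gpu_* helpers are the
ones added above, while the wave_sum function and its use are hypothetical),
a caller could combine the new portable helpers roughly like this:

  #include <stdint.h>
  #include <gpuintrin.h>

  // Hypothetical: sum 'value' across the active lanes of the warp or
  // wavefront and let the first active lane publish the result.
  static void wave_sum(uint32_t value, uint32_t *out) {
    uint64_t mask = __gpu_lane_mask();
    uint32_t total = __gpu_lane_reduce(mask, value);
    if (__gpu_is_first_lane(mask))
      *out = total;
  }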
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index a1b124ecd958ab..fc9769d4c578dd 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -10,175 +10,144 @@
 #define __NVPTXINTRIN_H
 
 #ifndef __NVPTX__
-#error "This file is intended for NVPTX targets or offloading to NVPTX
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
 #include <stdbool.h>
 #include <stdint.h>
 
 #if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
-#else
-#define _DEFAULT_ATTRS __attribute__((always_inline))
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
 #endif
 
 #pragma omp begin declare target device_type(nohost)
 #pragma omp begin declare variant match(device = {arch(nvptx64)})
 
 // Type aliases to the address spaces used by the NVPTX backend.
-#define _private __attribute__((opencl_private))
-#define _constant __attribute__((opencl_constant))
-#define _local __attribute__((opencl_local))
-#define _global __attribute__((opencl_global))
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
 
 // Attribute to declare a function as a kernel.
-#define _kernel __attribute__((nvptx_kernel))
+#define _Kernel __attribute__((nvptx_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 
 // Returns the number of CUDA blocks in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
   return __nvvm_read_ptx_sreg_nctaid_y();
 }
 
 // Returns the number of CUDA blocks in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
   return __nvvm_read_ptx_sreg_nctaid_z();
 }
 
-// Returns the total number of CUDA blocks.
-_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
-  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
-}
-
 // Returns the 'x' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
   return __nvvm_read_ptx_sreg_ctaid_x();
 }
 
 // Returns the 'y' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
   return __nvvm_read_ptx_sreg_ctaid_y();
 }
 
 // Returns the 'z' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
   return __nvvm_read_ptx_sreg_ctaid_z();
 }
 
-// Returns the absolute id of the CUDA block.
-_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
-  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
-         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
-}
-
 // Returns the number of CUDA threads in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
   return __nvvm_read_ptx_sreg_ntid_x();
 }
 
 // Returns the number of CUDA threads in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
   return __nvvm_read_ptx_sreg_ntid_y();
 }
 
 // Returns the number of CUDA threads in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
   return __nvvm_read_ptx_sreg_ntid_z();
 }
 
-// Returns the total number of threads in the block.
-_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
-  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
-}
-
 // Returns the 'x' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
   return __nvvm_read_ptx_sreg_tid_x();
 }
 
 // Returns the 'y' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
   return __nvvm_read_ptx_sreg_tid_y();
 }
 
 // Returns the 'z' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
   return __nvvm_read_ptx_sreg_tid_z();
 }
 
-// Returns the absolute id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
-  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
-         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
-}
-
 // Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+  return __nvvm_read_ptx_sreg_warpsize();
+}
 
 // Returns the id of the thread inside of a CUDA warp executing together.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
   return __nvvm_read_ptx_sreg_laneid();
 }
 
 // Returns the bit-mask of active threads in the current warp.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
   return __nvvm_activemask();
 }
 
 // Copies the value from the first active thread in the warp to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_broadcast_value(uint64_t lane_mask, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t id = __builtin_ffs(mask) - 1;
-  return __nvvm_shfl_sync_idx_i32(mask, x, id, _get_lane_size() - 1);
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __id = __builtin_ffs(__mask) - 1;
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
-_ballot(uint64_t lane_mask, bool x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  return __nvvm_vote_ballot_sync(mask, x);
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  return __nvvm_vote_ballot_sync(__mask, __x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
   __syncthreads();
 }
 
 // Waits for all threads in the warp to reconverge for independent scheduling.
 _DEFAULT_ATTRS [[clang::convergent]] static inline void
-_sync_lane(uint64_t mask) {
-  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
+__gpu_sync_lane(uint64_t __lane_mask) {
+  __nvvm_bar_warp_sync((uint32_t)__lane_mask);
 }
 
 // Shuffles the lanes inside the warp according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t bitmask = (mask >> idx) & 1;
-  return -bitmask &
-         __nvvm_shfl_sync_idx_i32(mask, x, idx, _get_lane_size() - 1);
-}
-
-// Returns the current value of the GPU's processor clock.
-_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
-  return __builtin_readcyclecounter();
-}
-
-// Returns a global fixed-frequency timer at nanosecond frequency.
-_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
+__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
 }
 
 // Terminates execution of the calling thread.
-_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() { __nvvm_exit(); }
+_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() { __nvvm_exit(); }
 
 #pragma omp end declare variant
 #pragma omp end declare target
-#undef _DEFAULT_ATTRS
 
 #endif // __NVPTXINTRIN_H
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
new file mode 100644
index 00000000000000..ff5a816699ebef
--- /dev/null
+++ b/clang/test/Headers/gpuintrin.c
@@ -0,0 +1,508 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=AMDGPU
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -target-feature +ptx62 \
+// RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=NVPTX
+
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#include <gpuintrin.h>
+
+// AMDGPU-LABEL: define dso_local void @foo(
+// AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[RETVAL_I116:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I114:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I112:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I110:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I19_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I17_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I15_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I12_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I9_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I_I103:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I104:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I101:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I99:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I97:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I7_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I4_I87:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I_I88:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I89:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I84:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I81:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I78:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I26_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I24_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I22_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I18_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I14_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I_I70:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I71:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I68:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I66:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I64:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I8_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I4_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I_I58:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I59:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I54:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I50:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I47:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I_I:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I_I:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I42:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I43:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I38:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I39:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I32:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I33:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__IDX_ADDR_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__X_ADDR_I34:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I30:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I24:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I25:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__X_ADDR_I26:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__X_ADDR_I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I48:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I47]] to ptr
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META3:![0-9]+]]
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG4:![0-9]+]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I49:%.*]] = zext i16 [[TMP5]] to i32
+// AMDGPU-NEXT:    [[DIV_I:%.*]] = udiv i32 [[TMP2]], [[CONV_I49]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I51:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I50]] to ptr
+// AMDGPU-NEXT:    [[TMP6:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16
+// AMDGPU-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP9:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP9]], i32 14
+// AMDGPU-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I52:%.*]] = zext i16 [[TMP11]] to i32
+// AMDGPU-NEXT:    [[DIV_I53:%.*]] = udiv i32 [[TMP8]], [[CONV_I52]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I55:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I54]] to ptr
+// AMDGPU-NEXT:    [[TMP12:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP12]], i32 20
+// AMDGPU-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(4) [[TMP13]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP15:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP15]], i32 16
+// AMDGPU-NEXT:    [[TMP17:%.*]] = load i16, ptr addrspace(4) [[TMP16]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I56:%.*]] = zext i16 [[TMP17]] to i32
+// AMDGPU-NEXT:    [[DIV_I57:%.*]] = udiv i32 [[TMP14]], [[CONV_I56]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I60:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I59]] to ptr
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I61:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I58]] to ptr
+// AMDGPU-NEXT:    [[TMP18:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i32 12
+// AMDGPU-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP21:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP21]], i32 12
+// AMDGPU-NEXT:    [[TMP23:%.*]] = load i16, ptr addrspace(4) [[TMP22]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I_I62:%.*]] = zext i16 [[TMP23]] to i32
+// AMDGPU-NEXT:    [[DIV_I_I:%.*]] = udiv i32 [[TMP20]], [[CONV_I_I62]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I5_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4_I]] to ptr
+// AMDGPU-NEXT:    [[TMP24:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP24]], i32 16
+// AMDGPU-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(4) [[TMP25]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP27:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP27]], i32 14
+// AMDGPU-NEXT:    [[TMP29:%.*]] = load i16, ptr addrspace(4) [[TMP28]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I6_I:%.*]] = zext i16 [[TMP29]] to i32
+// AMDGPU-NEXT:    [[DIV_I7_I:%.*]] = udiv i32 [[TMP26]], [[CONV_I6_I]]
+// AMDGPU-NEXT:    [[MUL_I:%.*]] = mul i32 [[DIV_I_I]], [[DIV_I7_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I9_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I8_I]] to ptr
+// AMDGPU-NEXT:    [[TMP30:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP30]], i32 20
+// AMDGPU-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(4) [[TMP31]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP33:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP33]], i32 16
+// AMDGPU-NEXT:    [[TMP35:%.*]] = load i16, ptr addrspace(4) [[TMP34]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I10_I:%.*]] = zext i16 [[TMP35]] to i32
+// AMDGPU-NEXT:    [[DIV_I11_I:%.*]] = udiv i32 [[TMP32]], [[CONV_I10_I]]
+// AMDGPU-NEXT:    [[MUL3_I:%.*]] = mul i32 [[MUL_I]], [[DIV_I11_I]]
+// AMDGPU-NEXT:    [[CONV_I63:%.*]] = zext i32 [[MUL3_I]] to i64
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I65:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I64]] to ptr
+// AMDGPU-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I67:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I66]] to ptr
+// AMDGPU-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I69:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I68]] to ptr
+// AMDGPU-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I72:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I71]] to ptr
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I23_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I22_I]] to ptr
+// AMDGPU-NEXT:    [[TMP39:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// AMDGPU-NEXT:    [[CONV_I73:%.*]] = zext i32 [[TMP39]] to i64
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I15_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I14_I]] to ptr
+// AMDGPU-NEXT:    [[TMP40:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP40]], i32 12
+// AMDGPU-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(4) [[TMP41]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP43:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP43]], i32 12
+// AMDGPU-NEXT:    [[TMP45:%.*]] = load i16, ptr addrspace(4) [[TMP44]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I16_I:%.*]] = zext i16 [[TMP45]] to i32
+// AMDGPU-NEXT:    [[DIV_I17_I:%.*]] = udiv i32 [[TMP42]], [[CONV_I16_I]]
+// AMDGPU-NEXT:    [[CONV2_I:%.*]] = zext i32 [[DIV_I17_I]] to i64
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I25_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I24_I]] to ptr
+// AMDGPU-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+// AMDGPU-NEXT:    [[CONV4_I:%.*]] = zext i32 [[TMP46]] to i64
+// AMDGPU-NEXT:    [[MUL_I74:%.*]] = mul i64 [[CONV2_I]], [[CONV4_I]]
+// AMDGPU-NEXT:    [[ADD_I:%.*]] = add i64 [[CONV_I73]], [[MUL_I74]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I75:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I70]] to ptr
+// AMDGPU-NEXT:    [[TMP47:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP47]], i32 12
+// AMDGPU-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(4) [[TMP48]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP50:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP50]], i32 12
+// AMDGPU-NEXT:    [[TMP52:%.*]] = load i16, ptr addrspace(4) [[TMP51]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I_I76:%.*]] = zext i16 [[TMP52]] to i32
+// AMDGPU-NEXT:    [[DIV_I_I77:%.*]] = udiv i32 [[TMP49]], [[CONV_I_I76]]
+// AMDGPU-NEXT:    [[CONV6_I:%.*]] = zext i32 [[DIV_I_I77]] to i64
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I19_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I18_I]] to ptr
+// AMDGPU-NEXT:    [[TMP53:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGPU-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP53]], i32 16
+// AMDGPU-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(4) [[TMP54]], align 4, !invariant.load [[META3]]
+// AMDGPU-NEXT:    [[TMP56:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP56]], i32 14
+// AMDGPU-NEXT:    [[TMP58:%.*]] = load i16, ptr addrspace(4) [[TMP57]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I20_I:%.*]] = zext i16 [[TMP58]] to i32
+// AMDGPU-NEXT:    [[DIV_I21_I:%.*]] = udiv i32 [[TMP55]], [[CONV_I20_I]]
+// AMDGPU-NEXT:    [[CONV8_I:%.*]] = zext i32 [[DIV_I21_I]] to i64
+// AMDGPU-NEXT:    [[MUL9_I:%.*]] = mul i64 [[CONV6_I]], [[CONV8_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I27_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I26_I]] to ptr
+// AMDGPU-NEXT:    [[TMP59:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+// AMDGPU-NEXT:    [[CONV11_I:%.*]] = zext i32 [[TMP59]] to i64
+// AMDGPU-NEXT:    [[MUL12_I:%.*]] = mul i64 [[MUL9_I]], [[CONV11_I]]
+// AMDGPU-NEXT:    [[ADD13_I:%.*]] = add i64 [[ADD_I]], [[MUL12_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I79:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I78]] to ptr
+// AMDGPU-NEXT:    [[TMP60:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP60]], i32 12
+// AMDGPU-NEXT:    [[TMP62:%.*]] = load i16, ptr addrspace(4) [[TMP61]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I80:%.*]] = zext i16 [[TMP62]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I82:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I81]] to ptr
+// AMDGPU-NEXT:    [[TMP63:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP63]], i32 14
+// AMDGPU-NEXT:    [[TMP65:%.*]] = load i16, ptr addrspace(4) [[TMP64]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I83:%.*]] = zext i16 [[TMP65]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I85:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I84]] to ptr
+// AMDGPU-NEXT:    [[TMP66:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP66]], i32 16
+// AMDGPU-NEXT:    [[TMP68:%.*]] = load i16, ptr addrspace(4) [[TMP67]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I86:%.*]] = zext i16 [[TMP68]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I90:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I89]] to ptr
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I91:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I88]] to ptr
+// AMDGPU-NEXT:    [[TMP69:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP70:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP69]], i32 12
+// AMDGPU-NEXT:    [[TMP71:%.*]] = load i16, ptr addrspace(4) [[TMP70]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I_I92:%.*]] = zext i16 [[TMP71]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I5_I93:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4_I87]] to ptr
+// AMDGPU-NEXT:    [[TMP72:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP72]], i32 14
+// AMDGPU-NEXT:    [[TMP74:%.*]] = load i16, ptr addrspace(4) [[TMP73]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I6_I94:%.*]] = zext i16 [[TMP74]] to i32
+// AMDGPU-NEXT:    [[MUL_I95:%.*]] = mul i32 [[CONV_I_I92]], [[CONV_I6_I94]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I8_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I7_I]] to ptr
+// AMDGPU-NEXT:    [[TMP75:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP75]], i32 16
+// AMDGPU-NEXT:    [[TMP77:%.*]] = load i16, ptr addrspace(4) [[TMP76]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I9_I:%.*]] = zext i16 [[TMP77]] to i32
+// AMDGPU-NEXT:    [[MUL3_I96:%.*]] = mul i32 [[MUL_I95]], [[CONV_I9_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I98:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I97]] to ptr
+// AMDGPU-NEXT:    [[TMP78:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I100:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I99]] to ptr
+// AMDGPU-NEXT:    [[TMP79:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I102:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I101]] to ptr
+// AMDGPU-NEXT:    [[TMP80:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I105:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I104]] to ptr
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I16_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I15_I]] to ptr
+// AMDGPU-NEXT:    [[TMP81:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I10_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I9_I]] to ptr
+// AMDGPU-NEXT:    [[TMP82:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP82]], i32 12
+// AMDGPU-NEXT:    [[TMP84:%.*]] = load i16, ptr addrspace(4) [[TMP83]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I11_I:%.*]] = zext i16 [[TMP84]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I18_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I17_I]] to ptr
+// AMDGPU-NEXT:    [[TMP85:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+// AMDGPU-NEXT:    [[MUL_I106:%.*]] = mul i32 [[CONV_I11_I]], [[TMP85]]
+// AMDGPU-NEXT:    [[ADD_I107:%.*]] = add i32 [[TMP81]], [[MUL_I106]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I108:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I103]] to ptr
+// AMDGPU-NEXT:    [[TMP86:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP86]], i32 12
+// AMDGPU-NEXT:    [[TMP88:%.*]] = load i16, ptr addrspace(4) [[TMP87]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I_I109:%.*]] = zext i16 [[TMP88]] to i32
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I13_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I12_I]] to ptr
+// AMDGPU-NEXT:    [[TMP89:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGPU-NEXT:    [[TMP90:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP89]], i32 14
+// AMDGPU-NEXT:    [[TMP91:%.*]] = load i16, ptr addrspace(4) [[TMP90]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[CONV_I14_I:%.*]] = zext i16 [[TMP91]] to i32
+// AMDGPU-NEXT:    [[MUL5_I:%.*]] = mul i32 [[CONV_I_I109]], [[CONV_I14_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I20_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I19_I]] to ptr
+// AMDGPU-NEXT:    [[TMP92:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+// AMDGPU-NEXT:    [[MUL7_I:%.*]] = mul i32 [[MUL5_I]], [[TMP92]]
+// AMDGPU-NEXT:    [[ADD8_I:%.*]] = add i32 [[ADD_I107]], [[MUL7_I]]
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I111:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I110]] to ptr
+// AMDGPU-NEXT:    [[TMP93:%.*]] = call i32 @llvm.amdgcn.wavefrontsize()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I115:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I114]] to ptr
+// AMDGPU-NEXT:    [[TMP94:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+// AMDGPU-NEXT:    [[TMP95:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP94]])
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I117:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I116]] to ptr
+// AMDGPU-NEXT:    [[TMP96:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I]] to ptr
+// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I]], align 8
+// AMDGPU-NEXT:    store i32 -1, ptr [[__X_ADDR_ASCAST_I]], align 4
+// AMDGPU-NEXT:    [[TMP97:%.*]] = load i32, ptr [[__X_ADDR_ASCAST_I]], align 4
+// AMDGPU-NEXT:    [[TMP98:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP97]])
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I27:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I24]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I28:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I25]] to ptr
+// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I29:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I26]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I28]], align 8
+// AMDGPU-NEXT:    store i8 1, ptr [[__X_ADDR_ASCAST_I29]], align 1
+// AMDGPU-NEXT:    [[TMP99:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I28]], align 8
+// AMDGPU-NEXT:    [[TMP100:%.*]] = load i8, ptr [[__X_ADDR_ASCAST_I29]], align 1
+// AMDGPU-NEXT:    [[LOADEDV_I:%.*]] = trunc i8 [[TMP100]] to i1
+// AMDGPU-NEXT:    [[TMP101:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV_I]])
+// AMDGPU-NEXT:    [[AND_I:%.*]] = and i64 [[TMP99]], [[TMP101]]
+// AMDGPU-NEXT:    call void @llvm.amdgcn.s.barrier()
+// AMDGPU-NEXT:    fence syncscope("workgroup") acquire
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I31:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I30]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I31]], align 8
+// AMDGPU-NEXT:    call void @llvm.amdgcn.wave.barrier()
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I35:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I32]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I36:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I33]] to ptr
+// AMDGPU-NEXT:    [[__IDX_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__IDX_ADDR_I]] to ptr
+// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I37:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I34]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I36]], align 8
+// AMDGPU-NEXT:    store i32 -1, ptr [[__IDX_ADDR_ASCAST_I]], align 4
+// AMDGPU-NEXT:    store i32 -1, ptr [[__X_ADDR_ASCAST_I37]], align 4
+// AMDGPU-NEXT:    [[TMP102:%.*]] = load i32, ptr [[__IDX_ADDR_ASCAST_I]], align 4
+// AMDGPU-NEXT:    [[SHL_I:%.*]] = shl i32 [[TMP102]], 2
+// AMDGPU-NEXT:    [[TMP103:%.*]] = load i32, ptr [[__X_ADDR_ASCAST_I37]], align 4
+// AMDGPU-NEXT:    [[TMP104:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL_I]], i32 [[TMP103]])
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I40:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I38]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I41:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I39]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I41]], align 8
+// AMDGPU-NEXT:    [[TMP105:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I41]], align 8
+// AMDGPU-NEXT:    [[TMP106:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP105]], i1 true)
+// AMDGPU-NEXT:    [[TMP107:%.*]] = add i64 [[TMP106]], 1
+// AMDGPU-NEXT:    [[ISZERO_I:%.*]] = icmp eq i64 [[TMP105]], 0
+// AMDGPU-NEXT:    [[FFS_I:%.*]] = select i1 [[ISZERO_I]], i64 0, i64 [[TMP107]]
+// AMDGPU-NEXT:    [[CAST_I:%.*]] = trunc i64 [[FFS_I]] to i32
+// AMDGPU-NEXT:    [[SUB_I:%.*]] = sub nsw i32 [[CAST_I]], 1
+// AMDGPU-NEXT:    [[CONV_I:%.*]] = sext i32 [[SUB_I]] to i64
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I44:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I42]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I45:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I43]] to ptr
+// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I45]], align 8
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I113:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I112]] to ptr
+// AMDGPU-NEXT:    [[TMP108:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+// AMDGPU-NEXT:    [[TMP109:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP108]])
+// AMDGPU-NEXT:    [[CONV_I46:%.*]] = zext i32 [[TMP109]] to i64
+// AMDGPU-NEXT:    [[TMP110:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I45]], align 8
+// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I_I]] to ptr
+// AMDGPU-NEXT:    store i64 [[TMP110]], ptr [[__LANE_MASK_ADDR_ASCAST_I_I]], align 8
+// AMDGPU-NEXT:    [[TMP111:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I_I]], align 8
+// AMDGPU-NEXT:    [[TMP112:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP111]], i1 true)
+// AMDGPU-NEXT:    [[TMP113:%.*]] = add i64 [[TMP112]], 1
+// AMDGPU-NEXT:    [[ISZERO_I_I:%.*]] = icmp eq i64 [[TMP111]], 0
+// AMDGPU-NEXT:    [[FFS_I_I:%.*]] = select i1 [[ISZERO_I_I]], i64 0, i64 [[TMP113]]
+// AMDGPU-NEXT:    [[CAST_I_I:%.*]] = trunc i64 [[FFS_I_I]] to i32
+// AMDGPU-NEXT:    [[SUB_I_I:%.*]] = sub nsw i32 [[CAST_I_I]], 1
+// AMDGPU-NEXT:    [[CONV_I_I:%.*]] = sext i32 [[SUB_I_I]] to i64
+// AMDGPU-NEXT:    [[CMP_I:%.*]] = icmp eq i64 [[CONV_I46]], [[CONV_I_I]]
+// AMDGPU-NEXT:    call void @llvm.amdgcn.endpgm()
+// AMDGPU-NEXT:    unreachable
+//
+// NVPTX-LABEL: define dso_local void @foo(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I_I:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I42:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I37:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I30:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__IDX_ADDR_I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__X_ADDR_I31:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__MASK_I32:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__BITMASK_I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I28:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I24:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__X_ADDR_I25:%.*]] = alloca i8, align 1
+// NVPTX-NEXT:    [[__MASK_I26:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__X_ADDR_I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__MASK_I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__ID_I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// NVPTX-NEXT:    [[TMP1:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+// NVPTX-NEXT:    [[TMP2:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+// NVPTX-NEXT:    [[TMP3:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// NVPTX-NEXT:    [[TMP4:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+// NVPTX-NEXT:    [[MUL_I:%.*]] = mul i32 [[TMP3]], [[TMP4]]
+// NVPTX-NEXT:    [[TMP5:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+// NVPTX-NEXT:    [[MUL3_I:%.*]] = mul i32 [[MUL_I]], [[TMP5]]
+// NVPTX-NEXT:    [[CONV_I45:%.*]] = zext i32 [[MUL3_I]] to i64
+// NVPTX-NEXT:    [[TMP6:%.*]] = call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+// NVPTX-NEXT:    [[TMP7:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+// NVPTX-NEXT:    [[TMP8:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+// NVPTX-NEXT:    [[TMP9:%.*]] = call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+// NVPTX-NEXT:    [[CONV_I46:%.*]] = zext i32 [[TMP9]] to i64
+// NVPTX-NEXT:    [[TMP10:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// NVPTX-NEXT:    [[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// NVPTX-NEXT:    [[TMP11:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+// NVPTX-NEXT:    [[CONV4_I:%.*]] = zext i32 [[TMP11]] to i64
+// NVPTX-NEXT:    [[MUL_I47:%.*]] = mul i64 [[CONV2_I]], [[CONV4_I]]
+// NVPTX-NEXT:    [[ADD_I:%.*]] = add i64 [[CONV_I46]], [[MUL_I47]]
+// NVPTX-NEXT:    [[TMP12:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// NVPTX-NEXT:    [[CONV6_I:%.*]] = zext i32 [[TMP12]] to i64
+// NVPTX-NEXT:    [[TMP13:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+// NVPTX-NEXT:    [[CONV8_I:%.*]] = zext i32 [[TMP13]] to i64
+// NVPTX-NEXT:    [[MUL9_I:%.*]] = mul i64 [[CONV6_I]], [[CONV8_I]]
+// NVPTX-NEXT:    [[TMP14:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+// NVPTX-NEXT:    [[CONV11_I:%.*]] = zext i32 [[TMP14]] to i64
+// NVPTX-NEXT:    [[MUL12_I:%.*]] = mul i64 [[MUL9_I]], [[CONV11_I]]
+// NVPTX-NEXT:    [[ADD13_I:%.*]] = add i64 [[ADD_I]], [[MUL12_I]]
+// NVPTX-NEXT:    [[TMP15:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// NVPTX-NEXT:    [[TMP16:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+// NVPTX-NEXT:    [[TMP17:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+// NVPTX-NEXT:    [[TMP18:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// NVPTX-NEXT:    [[TMP19:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+// NVPTX-NEXT:    [[MUL_I48:%.*]] = mul i32 [[TMP18]], [[TMP19]]
+// NVPTX-NEXT:    [[TMP20:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+// NVPTX-NEXT:    [[MUL3_I49:%.*]] = mul i32 [[MUL_I48]], [[TMP20]]
+// NVPTX-NEXT:    [[TMP21:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// NVPTX-NEXT:    [[TMP22:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+// NVPTX-NEXT:    [[TMP23:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+// NVPTX-NEXT:    [[TMP24:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// NVPTX-NEXT:    [[TMP25:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// NVPTX-NEXT:    [[TMP26:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+// NVPTX-NEXT:    [[MUL_I50:%.*]] = mul i32 [[TMP25]], [[TMP26]]
+// NVPTX-NEXT:    [[ADD_I51:%.*]] = add i32 [[TMP24]], [[MUL_I50]]
+// NVPTX-NEXT:    [[TMP27:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// NVPTX-NEXT:    [[TMP28:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+// NVPTX-NEXT:    [[MUL5_I:%.*]] = mul i32 [[TMP27]], [[TMP28]]
+// NVPTX-NEXT:    [[TMP29:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+// NVPTX-NEXT:    [[MUL7_I:%.*]] = mul i32 [[MUL5_I]], [[TMP29]]
+// NVPTX-NEXT:    [[ADD8_I:%.*]] = add i32 [[ADD_I51]], [[MUL7_I]]
+// NVPTX-NEXT:    [[TMP30:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// NVPTX-NEXT:    [[TMP31:%.*]] = call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
+// NVPTX-NEXT:    [[TMP32:%.*]] = call i32 @llvm.nvvm.activemask()
+// NVPTX-NEXT:    [[CONV_I52:%.*]] = zext i32 [[TMP32]] to i64
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I]], align 8
+// NVPTX-NEXT:    store i32 -1, ptr [[__X_ADDR_I]], align 4
+// NVPTX-NEXT:    [[TMP33:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I]], align 8
+// NVPTX-NEXT:    [[CONV_I:%.*]] = trunc i64 [[TMP33]] to i32
+// NVPTX-NEXT:    store i32 [[CONV_I]], ptr [[__MASK_I]], align 4
+// NVPTX-NEXT:    [[TMP34:%.*]] = load i32, ptr [[__MASK_I]], align 4
+// NVPTX-NEXT:    [[TMP35:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP34]], i1 true)
+// NVPTX-NEXT:    [[TMP36:%.*]] = add i32 [[TMP35]], 1
+// NVPTX-NEXT:    [[ISZERO_I:%.*]] = icmp eq i32 [[TMP34]], 0
+// NVPTX-NEXT:    [[FFS_I:%.*]] = select i1 [[ISZERO_I]], i32 0, i32 [[TMP36]]
+// NVPTX-NEXT:    [[SUB_I:%.*]] = sub nsw i32 [[FFS_I]], 1
+// NVPTX-NEXT:    store i32 [[SUB_I]], ptr [[__ID_I]], align 4
+// NVPTX-NEXT:    [[TMP37:%.*]] = load i32, ptr [[__MASK_I]], align 4
+// NVPTX-NEXT:    [[TMP38:%.*]] = load i32, ptr [[__X_ADDR_I]], align 4
+// NVPTX-NEXT:    [[TMP39:%.*]] = load i32, ptr [[__ID_I]], align 4
+// NVPTX-NEXT:    [[TMP40:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// NVPTX-NEXT:    [[SUB1_I:%.*]] = sub i32 [[TMP40]], 1
+// NVPTX-NEXT:    [[TMP41:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP37]], i32 [[TMP38]], i32 [[TMP39]], i32 [[SUB1_I]])
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I24]], align 8
+// NVPTX-NEXT:    store i8 1, ptr [[__X_ADDR_I25]], align 1
+// NVPTX-NEXT:    [[TMP42:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I24]], align 8
+// NVPTX-NEXT:    [[CONV_I27:%.*]] = trunc i64 [[TMP42]] to i32
+// NVPTX-NEXT:    store i32 [[CONV_I27]], ptr [[__MASK_I26]], align 4
+// NVPTX-NEXT:    [[TMP43:%.*]] = load i32, ptr [[__MASK_I26]], align 4
+// NVPTX-NEXT:    [[TMP44:%.*]] = load i8, ptr [[__X_ADDR_I25]], align 1
+// NVPTX-NEXT:    [[LOADEDV_I:%.*]] = trunc i8 [[TMP44]] to i1
+// NVPTX-NEXT:    [[TMP45:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[TMP43]], i1 [[LOADEDV_I]])
+// NVPTX-NEXT:    [[CONV1_I:%.*]] = zext i32 [[TMP45]] to i64
+// NVPTX-NEXT:    call void @llvm.nvvm.barrier0()
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I28]], align 8
+// NVPTX-NEXT:    [[TMP46:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I28]], align 8
+// NVPTX-NEXT:    [[CONV_I29:%.*]] = trunc i64 [[TMP46]] to i32
+// NVPTX-NEXT:    call void @llvm.nvvm.bar.warp.sync(i32 [[CONV_I29]])
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I30]], align 8
+// NVPTX-NEXT:    store i32 -1, ptr [[__IDX_ADDR_I]], align 4
+// NVPTX-NEXT:    store i32 -1, ptr [[__X_ADDR_I31]], align 4
+// NVPTX-NEXT:    [[TMP47:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I30]], align 8
+// NVPTX-NEXT:    [[CONV_I33:%.*]] = trunc i64 [[TMP47]] to i32
+// NVPTX-NEXT:    store i32 [[CONV_I33]], ptr [[__MASK_I32]], align 4
+// NVPTX-NEXT:    [[TMP48:%.*]] = load i32, ptr [[__MASK_I32]], align 4
+// NVPTX-NEXT:    [[TMP49:%.*]] = load i32, ptr [[__IDX_ADDR_I]], align 4
+// NVPTX-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP48]], [[TMP49]]
+// NVPTX-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 1
+// NVPTX-NEXT:    store i32 [[AND_I]], ptr [[__BITMASK_I]], align 4
+// NVPTX-NEXT:    [[TMP50:%.*]] = load i32, ptr [[__BITMASK_I]], align 4
+// NVPTX-NEXT:    [[SUB_I34:%.*]] = sub i32 0, [[TMP50]]
+// NVPTX-NEXT:    [[TMP51:%.*]] = load i32, ptr [[__MASK_I32]], align 4
+// NVPTX-NEXT:    [[TMP52:%.*]] = load i32, ptr [[__X_ADDR_I31]], align 4
+// NVPTX-NEXT:    [[TMP53:%.*]] = load i32, ptr [[__IDX_ADDR_I]], align 4
+// NVPTX-NEXT:    [[TMP54:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// NVPTX-NEXT:    [[SUB1_I36:%.*]] = sub i32 [[TMP54]], 1
+// NVPTX-NEXT:    [[TMP55:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP51]], i32 [[TMP52]], i32 [[TMP53]], i32 [[SUB1_I36]])
+// NVPTX-NEXT:    [[AND2_I:%.*]] = and i32 [[SUB_I34]], [[TMP55]]
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I37]], align 8
+// NVPTX-NEXT:    [[TMP56:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I37]], align 8
+// NVPTX-NEXT:    [[TMP57:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP56]], i1 true)
+// NVPTX-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], 1
+// NVPTX-NEXT:    [[ISZERO_I38:%.*]] = icmp eq i64 [[TMP56]], 0
+// NVPTX-NEXT:    [[FFS_I39:%.*]] = select i1 [[ISZERO_I38]], i64 0, i64 [[TMP58]]
+// NVPTX-NEXT:    [[CAST_I:%.*]] = trunc i64 [[FFS_I39]] to i32
+// NVPTX-NEXT:    [[SUB_I40:%.*]] = sub nsw i32 [[CAST_I]], 1
+// NVPTX-NEXT:    [[CONV_I41:%.*]] = sext i32 [[SUB_I40]] to i64
+// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I42]], align 8
+// NVPTX-NEXT:    [[TMP59:%.*]] = call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
+// NVPTX-NEXT:    [[CONV_I44:%.*]] = zext i32 [[TMP59]] to i64
+// NVPTX-NEXT:    [[TMP60:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I42]], align 8
+// NVPTX-NEXT:    store i64 [[TMP60]], ptr [[__LANE_MASK_ADDR_I_I]], align 8
+// NVPTX-NEXT:    [[TMP61:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I_I]], align 8
+// NVPTX-NEXT:    [[TMP62:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP61]], i1 true)
+// NVPTX-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], 1
+// NVPTX-NEXT:    [[ISZERO_I_I:%.*]] = icmp eq i64 [[TMP61]], 0
+// NVPTX-NEXT:    [[FFS_I_I:%.*]] = select i1 [[ISZERO_I_I]], i64 0, i64 [[TMP63]]
+// NVPTX-NEXT:    [[CAST_I_I:%.*]] = trunc i64 [[FFS_I_I]] to i32
+// NVPTX-NEXT:    [[SUB_I_I:%.*]] = sub nsw i32 [[CAST_I_I]], 1
+// NVPTX-NEXT:    [[CONV_I_I:%.*]] = sext i32 [[SUB_I_I]] to i64
+// NVPTX-NEXT:    [[CMP_I:%.*]] = icmp eq i64 [[CONV_I44]], [[CONV_I_I]]
+// NVPTX-NEXT:    call void @llvm.nvvm.exit()
+// NVPTX-NEXT:    unreachable
+//
+void foo() {
+  __gpu_num_blocks_x();
+  __gpu_num_blocks_y();
+  __gpu_num_blocks_z();
+  __gpu_num_blocks();
+  __gpu_block_id_x();
+  __gpu_block_id_y();
+  __gpu_block_id_z();
+  __gpu_block_id();
+  __gpu_num_threads_x();
+  __gpu_num_threads_y();
+  __gpu_num_threads_z();
+  __gpu_num_threads();
+  __gpu_thread_id_x();
+  __gpu_thread_id_y();
+  __gpu_thread_id_z();
+  __gpu_thread_id();
+  __gpu_num_lanes();
+  __gpu_lane_id();
+  __gpu_lane_mask();
+  __gpu_broadcast(-1, -1);
+  __gpu_ballot(-1, 1);
+  __gpu_sync_threads();
+  __gpu_sync_lane(-1);
+  __gpu_shuffle_idx(-1, -1, -1);
+  __gpu_first_lane_id(-1);
+  __gpu_is_first_lane(-1);
+  __gpu_exit();
+}
+//.
+// AMDGPU: [[META3]] = !{}
+// AMDGPU: [[RNG4]] = !{i16 1, i16 1025}
+//.

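As a rough illustration only (not part of the patch), the flattened helpers
exercised by foo() above compose into a single global index; the function name
and buffer parameter below are made up for the sketch:

  #include <gpuintrin.h>

  // Hypothetical helper: flatten the absolute block id and the thread id
  // within the block into one global element index.
  static void scale(float *data, float factor) {
    uint64_t gid = __gpu_block_id() * __gpu_num_threads() + __gpu_thread_id();
    data[gid] *= factor;
  }
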
>From 2e2a3ea0f4105c2fdd789817824b5b0cfc6efbaf Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 3 Oct 2024 16:00:43 -0500
Subject: [PATCH 03/18] Add test for compiling from different languages

---
 clang/test/Headers/gpuintrin_lang.c | 61 +++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 clang/test/Headers/gpuintrin_lang.c

diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
new file mode 100644
index 00000000000000..f66a9f7e1fd499
--- /dev/null
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -0,0 +1,61 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple nvptx64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=CUDA
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=HIP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -cl-std=CL3.0 -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=OPENCL
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -cl-std=CL3.0 \
+// RUN:   -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
+// RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=OPENMP
+
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#include <gpuintrin.h>
+
+#ifdef __device__
+__device__ int foo() { return __gpu_thread_id_x(); }
+#else
+// CUDA-LABEL: define dso_local i32 @foo(
+// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
+// CUDA-NEXT:  [[ENTRY:.*:]]
+// CUDA-NEXT:    [[TMP0:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    ret i32 [[TMP0]]
+//
+// HIP-LABEL: define dso_local i32 @foo(
+// HIP-SAME: ) #[[ATTR0:[0-9]+]] {
+// HIP-NEXT:  [[ENTRY:.*:]]
+// HIP-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// HIP-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// HIP-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// HIP-NEXT:    ret i32 [[TMP0]]
+//
+// OPENCL-LABEL: define dso_local i32 @foo(
+// OPENCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENCL-NEXT:    ret i32 [[TMP0]]
+//
+// OPENMP-LABEL: define hidden i32 @foo(
+// OPENMP-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENMP-NEXT:  [[ENTRY:.*:]]
+// OPENMP-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENMP-NEXT:    ret i32 [[TMP0]]
+//
+int foo() { return __gpu_thread_id_x(); }
+#pragma omp declare target to(foo)
+#endif

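A minimal sketch of an OpenMP consumer, mirroring the device-only RUN lines
above (the function name is made up; only the target-device compile is
assumed, as in the test):

  #define _DEFAULT_ATTRS __attribute__((always_inline))
  #include <gpuintrin.h>

  // Returns the thread/workitem id in the 'x' dimension on the device.
  int thread_x(void) { return __gpu_thread_id_x(); }
  #pragma omp declare target to(thread_x)
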
>From 10e807dea7a7754b775f785b1ef06ef698254a44 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 3 Oct 2024 16:13:37 -0500
Subject: [PATCH 04/18] Add stricter atomic

---
 libc/src/__support/GPU/amdgpu/utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 6ab95403ca3890..f09f073c5e94bb 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -136,8 +136,8 @@ LIBC_INLINE uint32_t get_lane_size() {
 
 /// Waits for all the threads in the block to converge and issues a fence.
 [[clang::convergent]] LIBC_INLINE void sync_threads() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
   __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
 /// Waits for all pending memory operations to complete in program order.

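A hedged sketch of the pattern this ordering is meant to make safe, written
against the portable wrapper from gpuintrin.h rather than the libc utility;
the scratch buffer and indices are hypothetical:

  #include <gpuintrin.h>

  // Each thread publishes a value, then reads its partner's value. The
  // barrier must also act as a fence so the store is visible afterwards.
  void exchange(int *scratch, int my_id, int partner, int my_value, int *out) {
    scratch[my_id] = my_value; // store before the barrier
    __gpu_sync_threads();      // barrier with fence semantics on AMDGPU
    *out = scratch[partner];   // the partner's store is now visible
  }
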
>From db8dbd1f91fd0f523212ea8768f489df1c0bf7b1 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 25 Oct 2024 12:08:52 -0500
Subject: [PATCH 05/18] Change to __ for C

---
 clang/lib/Headers/amdgpuintrin.h | 11 ++++++-----
 clang/lib/Headers/nvptxintrin.h  | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 1fd7261cf4ca75..88ba55e4cbe8fb 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -26,13 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 // Type aliases to the address spaces used by the AMDGPU backend.
-#define _Private __attribute__((opencl_private))
-#define _Constant __attribute__((opencl_constant))
-#define _Local __attribute__((opencl_local))
-#define _Global __attribute__((opencl_global))
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fc9769d4c578dd..d8aefd67ccfced 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -26,13 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(nvptx64)})
 
 // Type aliases to the address spaces used by the NVPTX backend.
-#define _Private __attribute__((opencl_private))
-#define _Constant __attribute__((opencl_constant))
-#define _Local __attribute__((opencl_local))
-#define _Global __attribute__((opencl_global))
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define _Kernel __attribute__((nvptx_kernel, visibility("protected")))
+#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {

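A hedged sketch of how the renamed macros are spelled by a C consumer at this
point in the series (the kernel name and body are hypothetical):

  #include <gpuintrin.h>

  // Hypothetical kernel: every thread writes one element of a global buffer.
  __kernel void fill(__global int *dst, int value) {
    dst[__gpu_thread_id_x()] = value;
  }
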
>From 7ebfdf43061869f3630338b47f007d9342d56fe1 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 28 Oct 2024 12:05:47 -0500
Subject: [PATCH 06/18] Update names

---
 clang/lib/Headers/amdgpuintrin.h | 16 ++++++++--------
 clang/lib/Headers/gpuintrin.h    | 24 +++++++++++++++++-------
 clang/lib/Headers/nvptxintrin.h  | 16 ++++++++--------
 clang/test/Headers/gpuintrin.c   |  4 ++--
 4 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 88ba55e4cbe8fb..6bbe0f8c89b8a3 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -26,14 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 // Type aliases to the address spaces used by the AMDGPU backend.
-#define __private __attribute__((opencl_private))
-#define __constant __attribute__((opencl_constant))
-#define __local __attribute__((opencl_local))
-#define __global __attribute__((opencl_global))
-#define __generic __attribute__((opencl_generic))
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
@@ -113,7 +113,7 @@ _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
 
 // Copies the value from the first active thread in the wavefront to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
@@ -139,7 +139,7 @@ __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the lanes inside the wavefront according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 2531ad8bba70c8..2e80acdf78d285 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -5,6 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef __GPUINTRIN_H
 #define __GPUINTRIN_H
@@ -13,6 +21,8 @@
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#else
+#error "This header is only meant to be used on GPU architectures."
 #endif
 
 // Returns the total number of blocks / workgroups.
@@ -51,22 +61,22 @@ _DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t __gpu_lane_reduce(uint64_t __lane_mask,
-                                                        uint32_t x) {
+_DEFAULT_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
     uint32_t index = step + __gpu_lane_id();
-    x += __gpu_shuffle_idx(__lane_mask, index, x);
+    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
   }
-  return __gpu_broadcast(__lane_mask, x);
+  return __gpu_broadcast_u32(__lane_mask, x);
 }
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan(uint64_t __lane_mask,
-                                                      uint32_t x) {
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
+                                                          uint32_t x) {
   for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
     uint32_t index = __gpu_lane_id() - step;
     uint32_t bitmask = __gpu_lane_id() >= step;
-    x += -bitmask & __gpu_shuffle_idx(__lane_mask, index, x);
+    x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
   }
   return x;
 }
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index d8aefd67ccfced..66d236b9faf522 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -26,14 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(nvptx64)})
 
 // Type aliases to the address spaces used by the NVPTX backend.
-#define __private __attribute__((opencl_private))
-#define __constant __attribute__((opencl_constant))
-#define __local __attribute__((opencl_local))
-#define __global __attribute__((opencl_global))
-#define __generic __attribute__((opencl_generic))
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
@@ -112,7 +112,7 @@ _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
 
 // Copies the value from the first active thread in the warp to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __id = __builtin_ffs(__mask) - 1;
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
@@ -138,7 +138,7 @@ __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the lanes inside the warp according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __bitmask = (__mask >> __idx) & 1u;
   return -__bitmask &
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index ff5a816699ebef..4d015bb8981732 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -493,11 +493,11 @@ void foo() {
   __gpu_num_lanes();
   __gpu_lane_id();
   __gpu_lane_mask();
-  __gpu_broadcast(-1, -1);
+  __gpu_broadcast_u32(-1, -1);
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
-  __gpu_shuffle_idx(-1, -1, -1);
+  __gpu_shuffle_idx_u32(-1, -1, -1);
   __gpu_first_lane_id(-1);
   __gpu_is_first_lane(-1);
   __gpu_exit();

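A hedged usage sketch of the newly suffixed helpers (the wrapper function is
made up):

  #include <gpuintrin.h>

  // Sums one value per lane across the warp/wavefront; every currently
  // active lane must call this together and each receives the total.
  static uint32_t wave_sum(uint32_t partial) {
    return __gpu_lane_reduce_u32(__gpu_lane_mask(), partial);
  }
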
>From 24e04fa9473cb9af6fe93fd2172b45b0073f0b48 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 5 Nov 2024 12:57:45 -0600
Subject: [PATCH 07/18] Update for comments

---
 clang/lib/Headers/amdgpuintrin.h      |  55 +--
 clang/lib/Headers/gpuintrin.h         |  77 ++--
 clang/lib/Headers/nvptxintrin.h       |  57 +--
 clang/test/Headers/gpuintrin.c        | 524 +++-----------------------
 clang/test/Headers/gpuintrin_lang.c   |   2 +-
 libc/src/__support/GPU/amdgpu/utils.h |   2 +-
 6 files changed, 177 insertions(+), 540 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 6bbe0f8c89b8a3..056d8765340c72 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -16,10 +16,12 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#if !defined(_DEFAULT_FN_ATTRS)
 #if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_ATTRS __attribute__((device))
-#elif !defined(_DEFAULT_ATTRS)
-#define _DEFAULT_ATTRS
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
 #endif
 
 #pragma omp begin declare target device_type(nohost)
@@ -36,115 +38,114 @@
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) {
   return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) {
   return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
 // Returns the 'y' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) {
   return __builtin_amdgcn_workgroup_id_y();
 }
 
 // Returns the 'z' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) {
   return __builtin_amdgcn_workgroup_id_z();
 }
 
 // Returns the number of workitems in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) {
   return __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workitems in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) {
   return __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workitems in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) {
   return __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) {
   return __builtin_amdgcn_workitem_id_x();
 }
 
 // Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) {
   return __builtin_amdgcn_workitem_id_y();
 }
 
 // Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) {
   return __builtin_amdgcn_workitem_id_z();
 }
 
 // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
 // and compilation options.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) {
   return __builtin_amdgcn_wavefrontsize();
 }
 
 // Returns the id of the thread inside of an AMD wavefront executing together.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
 // Returns the bit-mask of active threads in the current wavefront.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) {
   return __builtin_amdgcn_read_exec();
 }
 
 // Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_DEFAULT_FN_ATTRS static inline uint32_t
 __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
-__gpu_ballot(uint64_t __lane_mask, bool __x) {
+_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                      bool __x) {
   // ANDing with lane_mask gives the nvptx semantics when lane_mask is a
   // subset of the active threads.
   return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
+_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) {
   __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }
 
 // Waits for all threads in the wavefront to converge; this is a no-op on AMDGPU.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void
-__gpu_sync_lane(uint64_t __lane_mask) {
+_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) {
   __builtin_amdgcn_wave_barrier();
 }
 
 // Shuffles the lanes inside the wavefront according to the given index.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_DEFAULT_FN_ATTRS static inline uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
 // Terminates execution of the associated wavefront.
-_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() {
+_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) {
   __builtin_amdgcn_endpgm();
 }
 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 2e80acdf78d285..f804c489481797 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -25,43 +25,76 @@
 #error "This header is only meant to be used on GPU architectures."
 #endif
 
-// Returns the total number of blocks / workgroups.
-_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
-  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_blocks_x();
+  case 1:
+    return __gpu_num_blocks_y();
+  case 2:
+    return __gpu_num_blocks_z();
+  default:
+    __builtin_unreachable();
+  }
 }
 
-// Returns the absolute id of the block / workgroup.
-_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
-  return __gpu_block_id_x() +
-         (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
-         (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
-             __gpu_block_id_z();
+// Returns the block id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_block_id_x();
+  case 1:
+    return __gpu_block_id_y();
+  case 2:
+    return __gpu_block_id_z();
+  default:
+    __builtin_unreachable();
+  }
 }
 
-// Returns the total number of threads in the block / workgroup.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
-  return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_threads_x();
+  case 1:
+    return __gpu_num_threads_y();
+  case 2:
+    return __gpu_num_threads_z();
+  default:
+    __builtin_unreachable();
+  }
 }
 
-// Returns the absolute id of the thread in the current block / workgroup.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
-  return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
-         __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_thread_id_x();
+  case 1:
+    return __gpu_thread_id_y();
+  case 2:
+    return __gpu_thread_id_z();
+  default:
+    __builtin_unreachable();
+  }
 }
 
 // Get the first active thread inside the lane.
-_DEFAULT_ATTRS static inline uint64_t
+_DEFAULT_FN_ATTRS static inline uint64_t
 __gpu_first_lane_id(uint64_t __lane_mask) {
   return __builtin_ffsll(__lane_mask) - 1;
 }
 
 // Conditional that is only true for a single thread in a lane.
-_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
+_DEFAULT_FN_ATTRS static inline bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static inline uint32_t
 __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
     uint32_t index = step + __gpu_lane_id();
@@ -71,8 +104,8 @@ __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
 }
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
-                                                          uint32_t x) {
+_DEFAULT_FN_ATTRS static inline uint32_t
+__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
     uint32_t index = __gpu_lane_id() - step;
     uint32_t bitmask = __gpu_lane_id() >= step;
@@ -81,6 +114,6 @@ _DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
   return x;
 }
 
-#undef _DEFAULT_ATTRS
+#undef _DEFAULT_FN_ATTRS
 
 #endif // __GPUINTRIN_H
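
A hedged sketch of the dimension-indexed form introduced here (the helper name
is made up; dimensions are 0, 1, and 2 as in the switches above):

  #include <gpuintrin.h>

  // Total number of threads in the current block under the new accessors.
  static uint32_t block_size(void) {
    return __gpu_num_threads(0) * __gpu_num_threads(1) * __gpu_num_threads(2);
  }
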
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 66d236b9faf522..0fee0faa4c9680 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -16,10 +16,12 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#if !defined(_DEFAULT_FN_ATTRS)
 #if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_ATTRS __attribute__((device))
-#elif !defined(_DEFAULT_ATTRS)
-#define _DEFAULT_ATTRS
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
 #endif
 
 #pragma omp begin declare target device_type(nohost)
@@ -33,85 +35,85 @@
 #define __gpu_generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 
 // Returns the number of CUDA blocks in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) {
   return __nvvm_read_ptx_sreg_nctaid_y();
 }
 
 // Returns the number of CUDA blocks in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) {
   return __nvvm_read_ptx_sreg_nctaid_z();
 }
 
 // Returns the 'x' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) {
   return __nvvm_read_ptx_sreg_ctaid_x();
 }
 
 // Returns the 'y' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) {
   return __nvvm_read_ptx_sreg_ctaid_y();
 }
 
 // Returns the 'z' dimension of the current CUDA block's id.
-_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) {
   return __nvvm_read_ptx_sreg_ctaid_z();
 }
 
 // Returns the number of CUDA threads in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) {
   return __nvvm_read_ptx_sreg_ntid_x();
 }
 
 // Returns the number of CUDA threads in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) {
   return __nvvm_read_ptx_sreg_ntid_y();
 }
 
 // Returns the number of CUDA threads in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) {
   return __nvvm_read_ptx_sreg_ntid_z();
 }
 
 // Returns the 'x' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) {
   return __nvvm_read_ptx_sreg_tid_x();
 }
 
 // Returns the 'y' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) {
   return __nvvm_read_ptx_sreg_tid_y();
 }
 
 // Returns the 'z' dimension id of the thread in the current CUDA block.
-_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) {
   return __nvvm_read_ptx_sreg_tid_z();
 }
 
 // Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) {
   return __nvvm_read_ptx_sreg_warpsize();
 }
 
 // Returns the id of the thread inside of a CUDA warp executing together.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) {
   return __nvvm_read_ptx_sreg_laneid();
 }
 
 // Returns the bit-mask of active threads in the current warp.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) {
   return __nvvm_activemask();
 }
 
 // Copies the value from the first active thread in the warp to the rest.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_DEFAULT_FN_ATTRS static inline uint32_t
 __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __id = __builtin_ffs(__mask) - 1;
@@ -119,25 +121,24 @@ __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
-__gpu_ballot(uint64_t __lane_mask, bool __x) {
+_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                      bool __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   return __nvvm_vote_ballot_sync(__mask, __x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
+_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) {
   __syncthreads();
 }
 
 // Waits for all threads in the warp to reconverge for independent scheduling.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void
-__gpu_sync_lane(uint64_t __lane_mask) {
+_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) {
   __nvvm_bar_warp_sync((uint32_t)__lane_mask);
 }
 
 // Shuffles the lanes inside the warp according to the given index.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_DEFAULT_FN_ATTRS static inline uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __bitmask = (__mask >> __idx) & 1u;
@@ -146,7 +147,9 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
 }
 
 // Terminates execution of the calling thread.
-_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() { __nvvm_exit(); }
+_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) {
+  __nvvm_exit();
+}
 
 #pragma omp end declare variant
 #pragma omp end declare target
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 4d015bb8981732..25e89b0e059e29 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -13,483 +13,87 @@
 #define _DEFAULT_ATTRS __attribute__((always_inline))
 #include <gpuintrin.h>
 
-// AMDGPU-LABEL: define dso_local void @foo(
+// AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
 // AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
-// AMDGPU-NEXT:    [[RETVAL_I116:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I114:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I112:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I110:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I19_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I17_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I15_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I12_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I9_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I_I103:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I104:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I101:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I99:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I97:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I7_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I4_I87:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I_I88:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I89:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I84:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I81:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I78:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I26_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I24_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I22_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I18_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I14_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I_I70:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I71:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I68:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I66:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I64:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I8_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I4_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I_I58:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I59:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I54:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I50:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I47:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I_I:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I_I:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I42:%.*]] = alloca i1, align 1, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I43:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I38:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I39:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I32:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I33:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__IDX_ADDR_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[__X_ADDR_I34:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I30:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I24:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I25:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__X_ADDR_I26:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_I:%.*]] = alloca i64, align 8, addrspace(5)
-// AMDGPU-NEXT:    [[__X_ADDR_I:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I48:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I47]] to ptr
-// AMDGPU-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META3:![0-9]+]]
-// AMDGPU-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12
-// AMDGPU-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG4:![0-9]+]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I49:%.*]] = zext i16 [[TMP5]] to i32
-// AMDGPU-NEXT:    [[DIV_I:%.*]] = udiv i32 [[TMP2]], [[CONV_I49]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I51:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I50]] to ptr
-// AMDGPU-NEXT:    [[TMP6:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16
-// AMDGPU-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP9:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP9]], i32 14
-// AMDGPU-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I52:%.*]] = zext i16 [[TMP11]] to i32
-// AMDGPU-NEXT:    [[DIV_I53:%.*]] = udiv i32 [[TMP8]], [[CONV_I52]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I55:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I54]] to ptr
-// AMDGPU-NEXT:    [[TMP12:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP12]], i32 20
-// AMDGPU-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(4) [[TMP13]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP15:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP15]], i32 16
-// AMDGPU-NEXT:    [[TMP17:%.*]] = load i16, ptr addrspace(4) [[TMP16]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I56:%.*]] = zext i16 [[TMP17]] to i32
-// AMDGPU-NEXT:    [[DIV_I57:%.*]] = udiv i32 [[TMP14]], [[CONV_I56]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I60:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I59]] to ptr
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I61:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I58]] to ptr
-// AMDGPU-NEXT:    [[TMP18:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i32 12
-// AMDGPU-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP21:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP21]], i32 12
-// AMDGPU-NEXT:    [[TMP23:%.*]] = load i16, ptr addrspace(4) [[TMP22]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I_I62:%.*]] = zext i16 [[TMP23]] to i32
-// AMDGPU-NEXT:    [[DIV_I_I:%.*]] = udiv i32 [[TMP20]], [[CONV_I_I62]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I5_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4_I]] to ptr
-// AMDGPU-NEXT:    [[TMP24:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP24]], i32 16
-// AMDGPU-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(4) [[TMP25]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP27:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP27]], i32 14
-// AMDGPU-NEXT:    [[TMP29:%.*]] = load i16, ptr addrspace(4) [[TMP28]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I6_I:%.*]] = zext i16 [[TMP29]] to i32
-// AMDGPU-NEXT:    [[DIV_I7_I:%.*]] = udiv i32 [[TMP26]], [[CONV_I6_I]]
-// AMDGPU-NEXT:    [[MUL_I:%.*]] = mul i32 [[DIV_I_I]], [[DIV_I7_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I9_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I8_I]] to ptr
-// AMDGPU-NEXT:    [[TMP30:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP30]], i32 20
-// AMDGPU-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(4) [[TMP31]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP33:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP33]], i32 16
-// AMDGPU-NEXT:    [[TMP35:%.*]] = load i16, ptr addrspace(4) [[TMP34]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I10_I:%.*]] = zext i16 [[TMP35]] to i32
-// AMDGPU-NEXT:    [[DIV_I11_I:%.*]] = udiv i32 [[TMP32]], [[CONV_I10_I]]
-// AMDGPU-NEXT:    [[MUL3_I:%.*]] = mul i32 [[MUL_I]], [[DIV_I11_I]]
-// AMDGPU-NEXT:    [[CONV_I63:%.*]] = zext i32 [[MUL3_I]] to i64
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I65:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I64]] to ptr
-// AMDGPU-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I67:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I66]] to ptr
-// AMDGPU-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I69:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I68]] to ptr
-// AMDGPU-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I72:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I71]] to ptr
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I23_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I22_I]] to ptr
-// AMDGPU-NEXT:    [[TMP39:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-// AMDGPU-NEXT:    [[CONV_I73:%.*]] = zext i32 [[TMP39]] to i64
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I15_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I14_I]] to ptr
-// AMDGPU-NEXT:    [[TMP40:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP40]], i32 12
-// AMDGPU-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(4) [[TMP41]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP43:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP43]], i32 12
-// AMDGPU-NEXT:    [[TMP45:%.*]] = load i16, ptr addrspace(4) [[TMP44]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I16_I:%.*]] = zext i16 [[TMP45]] to i32
-// AMDGPU-NEXT:    [[DIV_I17_I:%.*]] = udiv i32 [[TMP42]], [[CONV_I16_I]]
-// AMDGPU-NEXT:    [[CONV2_I:%.*]] = zext i32 [[DIV_I17_I]] to i64
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I25_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I24_I]] to ptr
-// AMDGPU-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-// AMDGPU-NEXT:    [[CONV4_I:%.*]] = zext i32 [[TMP46]] to i64
-// AMDGPU-NEXT:    [[MUL_I74:%.*]] = mul i64 [[CONV2_I]], [[CONV4_I]]
-// AMDGPU-NEXT:    [[ADD_I:%.*]] = add i64 [[CONV_I73]], [[MUL_I74]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I75:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I70]] to ptr
-// AMDGPU-NEXT:    [[TMP47:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP47]], i32 12
-// AMDGPU-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(4) [[TMP48]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP50:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP50]], i32 12
-// AMDGPU-NEXT:    [[TMP52:%.*]] = load i16, ptr addrspace(4) [[TMP51]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I_I76:%.*]] = zext i16 [[TMP52]] to i32
-// AMDGPU-NEXT:    [[DIV_I_I77:%.*]] = udiv i32 [[TMP49]], [[CONV_I_I76]]
-// AMDGPU-NEXT:    [[CONV6_I:%.*]] = zext i32 [[DIV_I_I77]] to i64
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I19_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I18_I]] to ptr
-// AMDGPU-NEXT:    [[TMP53:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// AMDGPU-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP53]], i32 16
-// AMDGPU-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(4) [[TMP54]], align 4, !invariant.load [[META3]]
-// AMDGPU-NEXT:    [[TMP56:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP56]], i32 14
-// AMDGPU-NEXT:    [[TMP58:%.*]] = load i16, ptr addrspace(4) [[TMP57]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I20_I:%.*]] = zext i16 [[TMP58]] to i32
-// AMDGPU-NEXT:    [[DIV_I21_I:%.*]] = udiv i32 [[TMP55]], [[CONV_I20_I]]
-// AMDGPU-NEXT:    [[CONV8_I:%.*]] = zext i32 [[DIV_I21_I]] to i64
-// AMDGPU-NEXT:    [[MUL9_I:%.*]] = mul i64 [[CONV6_I]], [[CONV8_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I27_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I26_I]] to ptr
-// AMDGPU-NEXT:    [[TMP59:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-// AMDGPU-NEXT:    [[CONV11_I:%.*]] = zext i32 [[TMP59]] to i64
-// AMDGPU-NEXT:    [[MUL12_I:%.*]] = mul i64 [[MUL9_I]], [[CONV11_I]]
-// AMDGPU-NEXT:    [[ADD13_I:%.*]] = add i64 [[ADD_I]], [[MUL12_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I79:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I78]] to ptr
-// AMDGPU-NEXT:    [[TMP60:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP60]], i32 12
-// AMDGPU-NEXT:    [[TMP62:%.*]] = load i16, ptr addrspace(4) [[TMP61]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I80:%.*]] = zext i16 [[TMP62]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I82:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I81]] to ptr
-// AMDGPU-NEXT:    [[TMP63:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP63]], i32 14
-// AMDGPU-NEXT:    [[TMP65:%.*]] = load i16, ptr addrspace(4) [[TMP64]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I83:%.*]] = zext i16 [[TMP65]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I85:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I84]] to ptr
-// AMDGPU-NEXT:    [[TMP66:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP66]], i32 16
-// AMDGPU-NEXT:    [[TMP68:%.*]] = load i16, ptr addrspace(4) [[TMP67]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I86:%.*]] = zext i16 [[TMP68]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I90:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I89]] to ptr
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I91:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I88]] to ptr
-// AMDGPU-NEXT:    [[TMP69:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP70:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP69]], i32 12
-// AMDGPU-NEXT:    [[TMP71:%.*]] = load i16, ptr addrspace(4) [[TMP70]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I_I92:%.*]] = zext i16 [[TMP71]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I5_I93:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4_I87]] to ptr
-// AMDGPU-NEXT:    [[TMP72:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP72]], i32 14
-// AMDGPU-NEXT:    [[TMP74:%.*]] = load i16, ptr addrspace(4) [[TMP73]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I6_I94:%.*]] = zext i16 [[TMP74]] to i32
-// AMDGPU-NEXT:    [[MUL_I95:%.*]] = mul i32 [[CONV_I_I92]], [[CONV_I6_I94]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I8_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I7_I]] to ptr
-// AMDGPU-NEXT:    [[TMP75:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP75]], i32 16
-// AMDGPU-NEXT:    [[TMP77:%.*]] = load i16, ptr addrspace(4) [[TMP76]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I9_I:%.*]] = zext i16 [[TMP77]] to i32
-// AMDGPU-NEXT:    [[MUL3_I96:%.*]] = mul i32 [[MUL_I95]], [[CONV_I9_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I98:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I97]] to ptr
-// AMDGPU-NEXT:    [[TMP78:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I100:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I99]] to ptr
-// AMDGPU-NEXT:    [[TMP79:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I102:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I101]] to ptr
-// AMDGPU-NEXT:    [[TMP80:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I105:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I104]] to ptr
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I16_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I15_I]] to ptr
-// AMDGPU-NEXT:    [[TMP81:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I10_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I9_I]] to ptr
-// AMDGPU-NEXT:    [[TMP82:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP82]], i32 12
-// AMDGPU-NEXT:    [[TMP84:%.*]] = load i16, ptr addrspace(4) [[TMP83]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I11_I:%.*]] = zext i16 [[TMP84]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I18_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I17_I]] to ptr
-// AMDGPU-NEXT:    [[TMP85:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
-// AMDGPU-NEXT:    [[MUL_I106:%.*]] = mul i32 [[CONV_I11_I]], [[TMP85]]
-// AMDGPU-NEXT:    [[ADD_I107:%.*]] = add i32 [[TMP81]], [[MUL_I106]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I108:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I103]] to ptr
-// AMDGPU-NEXT:    [[TMP86:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP86]], i32 12
-// AMDGPU-NEXT:    [[TMP88:%.*]] = load i16, ptr addrspace(4) [[TMP87]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I_I109:%.*]] = zext i16 [[TMP88]] to i32
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I13_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I12_I]] to ptr
-// AMDGPU-NEXT:    [[TMP89:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP90:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP89]], i32 14
-// AMDGPU-NEXT:    [[TMP91:%.*]] = load i16, ptr addrspace(4) [[TMP90]], align 2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
-// AMDGPU-NEXT:    [[CONV_I14_I:%.*]] = zext i16 [[TMP91]] to i32
-// AMDGPU-NEXT:    [[MUL5_I:%.*]] = mul i32 [[CONV_I_I109]], [[CONV_I14_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I20_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I19_I]] to ptr
-// AMDGPU-NEXT:    [[TMP92:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
-// AMDGPU-NEXT:    [[MUL7_I:%.*]] = mul i32 [[MUL5_I]], [[TMP92]]
-// AMDGPU-NEXT:    [[ADD8_I:%.*]] = add i32 [[ADD_I107]], [[MUL7_I]]
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I111:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I110]] to ptr
-// AMDGPU-NEXT:    [[TMP93:%.*]] = call i32 @llvm.amdgcn.wavefrontsize()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I115:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I114]] to ptr
-// AMDGPU-NEXT:    [[TMP94:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-// AMDGPU-NEXT:    [[TMP95:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP94]])
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I117:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I116]] to ptr
-// AMDGPU-NEXT:    [[TMP96:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I]] to ptr
-// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I]], align 8
-// AMDGPU-NEXT:    store i32 -1, ptr [[__X_ADDR_ASCAST_I]], align 4
-// AMDGPU-NEXT:    [[TMP97:%.*]] = load i32, ptr [[__X_ADDR_ASCAST_I]], align 4
-// AMDGPU-NEXT:    [[TMP98:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP97]])
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I27:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I24]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I28:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I25]] to ptr
-// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I29:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I26]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I28]], align 8
-// AMDGPU-NEXT:    store i8 1, ptr [[__X_ADDR_ASCAST_I29]], align 1
-// AMDGPU-NEXT:    [[TMP99:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I28]], align 8
-// AMDGPU-NEXT:    [[TMP100:%.*]] = load i8, ptr [[__X_ADDR_ASCAST_I29]], align 1
-// AMDGPU-NEXT:    [[LOADEDV_I:%.*]] = trunc i8 [[TMP100]] to i1
-// AMDGPU-NEXT:    [[TMP101:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV_I]])
-// AMDGPU-NEXT:    [[AND_I:%.*]] = and i64 [[TMP99]], [[TMP101]]
-// AMDGPU-NEXT:    call void @llvm.amdgcn.s.barrier()
-// AMDGPU-NEXT:    fence syncscope("workgroup") acquire
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I31:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I30]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I31]], align 8
-// AMDGPU-NEXT:    call void @llvm.amdgcn.wave.barrier()
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I35:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I32]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I36:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I33]] to ptr
-// AMDGPU-NEXT:    [[__IDX_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__IDX_ADDR_I]] to ptr
-// AMDGPU-NEXT:    [[__X_ADDR_ASCAST_I37:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I34]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I36]], align 8
-// AMDGPU-NEXT:    store i32 -1, ptr [[__IDX_ADDR_ASCAST_I]], align 4
-// AMDGPU-NEXT:    store i32 -1, ptr [[__X_ADDR_ASCAST_I37]], align 4
-// AMDGPU-NEXT:    [[TMP102:%.*]] = load i32, ptr [[__IDX_ADDR_ASCAST_I]], align 4
-// AMDGPU-NEXT:    [[SHL_I:%.*]] = shl i32 [[TMP102]], 2
-// AMDGPU-NEXT:    [[TMP103:%.*]] = load i32, ptr [[__X_ADDR_ASCAST_I37]], align 4
-// AMDGPU-NEXT:    [[TMP104:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL_I]], i32 [[TMP103]])
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I40:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I38]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I41:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I39]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I41]], align 8
-// AMDGPU-NEXT:    [[TMP105:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I41]], align 8
-// AMDGPU-NEXT:    [[TMP106:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP105]], i1 true)
-// AMDGPU-NEXT:    [[TMP107:%.*]] = add i64 [[TMP106]], 1
-// AMDGPU-NEXT:    [[ISZERO_I:%.*]] = icmp eq i64 [[TMP105]], 0
-// AMDGPU-NEXT:    [[FFS_I:%.*]] = select i1 [[ISZERO_I]], i64 0, i64 [[TMP107]]
-// AMDGPU-NEXT:    [[CAST_I:%.*]] = trunc i64 [[FFS_I]] to i32
-// AMDGPU-NEXT:    [[SUB_I:%.*]] = sub nsw i32 [[CAST_I]], 1
-// AMDGPU-NEXT:    [[CONV_I:%.*]] = sext i32 [[SUB_I]] to i64
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I44:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I42]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I45:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I43]] to ptr
-// AMDGPU-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_ASCAST_I45]], align 8
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I113:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I112]] to ptr
-// AMDGPU-NEXT:    [[TMP108:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-// AMDGPU-NEXT:    [[TMP109:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP108]])
-// AMDGPU-NEXT:    [[CONV_I46:%.*]] = zext i32 [[TMP109]] to i64
-// AMDGPU-NEXT:    [[TMP110:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I45]], align 8
-// AMDGPU-NEXT:    [[RETVAL_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I_I]] to ptr
-// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR_I_I]] to ptr
-// AMDGPU-NEXT:    store i64 [[TMP110]], ptr [[__LANE_MASK_ADDR_ASCAST_I_I]], align 8
-// AMDGPU-NEXT:    [[TMP111:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST_I_I]], align 8
-// AMDGPU-NEXT:    [[TMP112:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP111]], i1 true)
-// AMDGPU-NEXT:    [[TMP113:%.*]] = add i64 [[TMP112]], 1
-// AMDGPU-NEXT:    [[ISZERO_I_I:%.*]] = icmp eq i64 [[TMP111]], 0
-// AMDGPU-NEXT:    [[FFS_I_I:%.*]] = select i1 [[ISZERO_I_I]], i64 0, i64 [[TMP113]]
-// AMDGPU-NEXT:    [[CAST_I_I:%.*]] = trunc i64 [[FFS_I_I]] to i32
-// AMDGPU-NEXT:    [[SUB_I_I:%.*]] = sub nsw i32 [[CAST_I_I]], 1
-// AMDGPU-NEXT:    [[CONV_I_I:%.*]] = sext i32 [[SUB_I_I]] to i64
-// AMDGPU-NEXT:    [[CMP_I:%.*]] = icmp eq i64 [[CONV_I46]], [[CONV_I_I]]
-// AMDGPU-NEXT:    call void @llvm.amdgcn.endpgm()
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7:[0-9]+]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_broadcast_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
 // AMDGPU-NEXT:    unreachable
 //
-// NVPTX-LABEL: define dso_local void @foo(
+// NVPTX-LABEL: define protected void @foo(
 // NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I_I:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I42:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I37:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I30:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__IDX_ADDR_I:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__X_ADDR_I31:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__MASK_I32:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__BITMASK_I:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I28:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I24:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__X_ADDR_I25:%.*]] = alloca i8, align 1
-// NVPTX-NEXT:    [[__MASK_I26:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__LANE_MASK_ADDR_I:%.*]] = alloca i64, align 8
-// NVPTX-NEXT:    [[__X_ADDR_I:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__MASK_I:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[__ID_I:%.*]] = alloca i32, align 4
-// NVPTX-NEXT:    [[TMP0:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-// NVPTX-NEXT:    [[TMP1:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-// NVPTX-NEXT:    [[TMP2:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-// NVPTX-NEXT:    [[TMP3:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-// NVPTX-NEXT:    [[TMP4:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-// NVPTX-NEXT:    [[MUL_I:%.*]] = mul i32 [[TMP3]], [[TMP4]]
-// NVPTX-NEXT:    [[TMP5:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-// NVPTX-NEXT:    [[MUL3_I:%.*]] = mul i32 [[MUL_I]], [[TMP5]]
-// NVPTX-NEXT:    [[CONV_I45:%.*]] = zext i32 [[MUL3_I]] to i64
-// NVPTX-NEXT:    [[TMP6:%.*]] = call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-// NVPTX-NEXT:    [[TMP7:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-// NVPTX-NEXT:    [[TMP8:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-// NVPTX-NEXT:    [[TMP9:%.*]] = call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-// NVPTX-NEXT:    [[CONV_I46:%.*]] = zext i32 [[TMP9]] to i64
-// NVPTX-NEXT:    [[TMP10:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-// NVPTX-NEXT:    [[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
-// NVPTX-NEXT:    [[TMP11:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-// NVPTX-NEXT:    [[CONV4_I:%.*]] = zext i32 [[TMP11]] to i64
-// NVPTX-NEXT:    [[MUL_I47:%.*]] = mul i64 [[CONV2_I]], [[CONV4_I]]
-// NVPTX-NEXT:    [[ADD_I:%.*]] = add i64 [[CONV_I46]], [[MUL_I47]]
-// NVPTX-NEXT:    [[TMP12:%.*]] = call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-// NVPTX-NEXT:    [[CONV6_I:%.*]] = zext i32 [[TMP12]] to i64
-// NVPTX-NEXT:    [[TMP13:%.*]] = call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-// NVPTX-NEXT:    [[CONV8_I:%.*]] = zext i32 [[TMP13]] to i64
-// NVPTX-NEXT:    [[MUL9_I:%.*]] = mul i64 [[CONV6_I]], [[CONV8_I]]
-// NVPTX-NEXT:    [[TMP14:%.*]] = call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-// NVPTX-NEXT:    [[CONV11_I:%.*]] = zext i32 [[TMP14]] to i64
-// NVPTX-NEXT:    [[MUL12_I:%.*]] = mul i64 [[MUL9_I]], [[CONV11_I]]
-// NVPTX-NEXT:    [[ADD13_I:%.*]] = add i64 [[ADD_I]], [[MUL12_I]]
-// NVPTX-NEXT:    [[TMP15:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// NVPTX-NEXT:    [[TMP16:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-// NVPTX-NEXT:    [[TMP17:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-// NVPTX-NEXT:    [[TMP18:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// NVPTX-NEXT:    [[TMP19:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-// NVPTX-NEXT:    [[MUL_I48:%.*]] = mul i32 [[TMP18]], [[TMP19]]
-// NVPTX-NEXT:    [[TMP20:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-// NVPTX-NEXT:    [[MUL3_I49:%.*]] = mul i32 [[MUL_I48]], [[TMP20]]
-// NVPTX-NEXT:    [[TMP21:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// NVPTX-NEXT:    [[TMP22:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-// NVPTX-NEXT:    [[TMP23:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-// NVPTX-NEXT:    [[TMP24:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// NVPTX-NEXT:    [[TMP25:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// NVPTX-NEXT:    [[TMP26:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-// NVPTX-NEXT:    [[MUL_I50:%.*]] = mul i32 [[TMP25]], [[TMP26]]
-// NVPTX-NEXT:    [[ADD_I51:%.*]] = add i32 [[TMP24]], [[MUL_I50]]
-// NVPTX-NEXT:    [[TMP27:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// NVPTX-NEXT:    [[TMP28:%.*]] = call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-// NVPTX-NEXT:    [[MUL5_I:%.*]] = mul i32 [[TMP27]], [[TMP28]]
-// NVPTX-NEXT:    [[TMP29:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-// NVPTX-NEXT:    [[MUL7_I:%.*]] = mul i32 [[MUL5_I]], [[TMP29]]
-// NVPTX-NEXT:    [[ADD8_I:%.*]] = add i32 [[ADD_I51]], [[MUL7_I]]
-// NVPTX-NEXT:    [[TMP30:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// NVPTX-NEXT:    [[TMP31:%.*]] = call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
-// NVPTX-NEXT:    [[TMP32:%.*]] = call i32 @llvm.nvvm.activemask()
-// NVPTX-NEXT:    [[CONV_I52:%.*]] = zext i32 [[TMP32]] to i64
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I]], align 8
-// NVPTX-NEXT:    store i32 -1, ptr [[__X_ADDR_I]], align 4
-// NVPTX-NEXT:    [[TMP33:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I]], align 8
-// NVPTX-NEXT:    [[CONV_I:%.*]] = trunc i64 [[TMP33]] to i32
-// NVPTX-NEXT:    store i32 [[CONV_I]], ptr [[__MASK_I]], align 4
-// NVPTX-NEXT:    [[TMP34:%.*]] = load i32, ptr [[__MASK_I]], align 4
-// NVPTX-NEXT:    [[TMP35:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP34]], i1 true)
-// NVPTX-NEXT:    [[TMP36:%.*]] = add i32 [[TMP35]], 1
-// NVPTX-NEXT:    [[ISZERO_I:%.*]] = icmp eq i32 [[TMP34]], 0
-// NVPTX-NEXT:    [[FFS_I:%.*]] = select i1 [[ISZERO_I]], i32 0, i32 [[TMP36]]
-// NVPTX-NEXT:    [[SUB_I:%.*]] = sub nsw i32 [[FFS_I]], 1
-// NVPTX-NEXT:    store i32 [[SUB_I]], ptr [[__ID_I]], align 4
-// NVPTX-NEXT:    [[TMP37:%.*]] = load i32, ptr [[__MASK_I]], align 4
-// NVPTX-NEXT:    [[TMP38:%.*]] = load i32, ptr [[__X_ADDR_I]], align 4
-// NVPTX-NEXT:    [[TMP39:%.*]] = load i32, ptr [[__ID_I]], align 4
-// NVPTX-NEXT:    [[TMP40:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// NVPTX-NEXT:    [[SUB1_I:%.*]] = sub i32 [[TMP40]], 1
-// NVPTX-NEXT:    [[TMP41:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP37]], i32 [[TMP38]], i32 [[TMP39]], i32 [[SUB1_I]])
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I24]], align 8
-// NVPTX-NEXT:    store i8 1, ptr [[__X_ADDR_I25]], align 1
-// NVPTX-NEXT:    [[TMP42:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I24]], align 8
-// NVPTX-NEXT:    [[CONV_I27:%.*]] = trunc i64 [[TMP42]] to i32
-// NVPTX-NEXT:    store i32 [[CONV_I27]], ptr [[__MASK_I26]], align 4
-// NVPTX-NEXT:    [[TMP43:%.*]] = load i32, ptr [[__MASK_I26]], align 4
-// NVPTX-NEXT:    [[TMP44:%.*]] = load i8, ptr [[__X_ADDR_I25]], align 1
-// NVPTX-NEXT:    [[LOADEDV_I:%.*]] = trunc i8 [[TMP44]] to i1
-// NVPTX-NEXT:    [[TMP45:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[TMP43]], i1 [[LOADEDV_I]])
-// NVPTX-NEXT:    [[CONV1_I:%.*]] = zext i32 [[TMP45]] to i64
-// NVPTX-NEXT:    call void @llvm.nvvm.barrier0()
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I28]], align 8
-// NVPTX-NEXT:    [[TMP46:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I28]], align 8
-// NVPTX-NEXT:    [[CONV_I29:%.*]] = trunc i64 [[TMP46]] to i32
-// NVPTX-NEXT:    call void @llvm.nvvm.bar.warp.sync(i32 [[CONV_I29]])
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I30]], align 8
-// NVPTX-NEXT:    store i32 -1, ptr [[__IDX_ADDR_I]], align 4
-// NVPTX-NEXT:    store i32 -1, ptr [[__X_ADDR_I31]], align 4
-// NVPTX-NEXT:    [[TMP47:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I30]], align 8
-// NVPTX-NEXT:    [[CONV_I33:%.*]] = trunc i64 [[TMP47]] to i32
-// NVPTX-NEXT:    store i32 [[CONV_I33]], ptr [[__MASK_I32]], align 4
-// NVPTX-NEXT:    [[TMP48:%.*]] = load i32, ptr [[__MASK_I32]], align 4
-// NVPTX-NEXT:    [[TMP49:%.*]] = load i32, ptr [[__IDX_ADDR_I]], align 4
-// NVPTX-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP48]], [[TMP49]]
-// NVPTX-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 1
-// NVPTX-NEXT:    store i32 [[AND_I]], ptr [[__BITMASK_I]], align 4
-// NVPTX-NEXT:    [[TMP50:%.*]] = load i32, ptr [[__BITMASK_I]], align 4
-// NVPTX-NEXT:    [[SUB_I34:%.*]] = sub i32 0, [[TMP50]]
-// NVPTX-NEXT:    [[TMP51:%.*]] = load i32, ptr [[__MASK_I32]], align 4
-// NVPTX-NEXT:    [[TMP52:%.*]] = load i32, ptr [[__X_ADDR_I31]], align 4
-// NVPTX-NEXT:    [[TMP53:%.*]] = load i32, ptr [[__IDX_ADDR_I]], align 4
-// NVPTX-NEXT:    [[TMP54:%.*]] = call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// NVPTX-NEXT:    [[SUB1_I36:%.*]] = sub i32 [[TMP54]], 1
-// NVPTX-NEXT:    [[TMP55:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP51]], i32 [[TMP52]], i32 [[TMP53]], i32 [[SUB1_I36]])
-// NVPTX-NEXT:    [[AND2_I:%.*]] = and i32 [[SUB_I34]], [[TMP55]]
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I37]], align 8
-// NVPTX-NEXT:    [[TMP56:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I37]], align 8
-// NVPTX-NEXT:    [[TMP57:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP56]], i1 true)
-// NVPTX-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], 1
-// NVPTX-NEXT:    [[ISZERO_I38:%.*]] = icmp eq i64 [[TMP56]], 0
-// NVPTX-NEXT:    [[FFS_I39:%.*]] = select i1 [[ISZERO_I38]], i64 0, i64 [[TMP58]]
-// NVPTX-NEXT:    [[CAST_I:%.*]] = trunc i64 [[FFS_I39]] to i32
-// NVPTX-NEXT:    [[SUB_I40:%.*]] = sub nsw i32 [[CAST_I]], 1
-// NVPTX-NEXT:    [[CONV_I41:%.*]] = sext i32 [[SUB_I40]] to i64
-// NVPTX-NEXT:    store i64 -1, ptr [[__LANE_MASK_ADDR_I42]], align 8
-// NVPTX-NEXT:    [[TMP59:%.*]] = call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
-// NVPTX-NEXT:    [[CONV_I44:%.*]] = zext i32 [[TMP59]] to i64
-// NVPTX-NEXT:    [[TMP60:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I42]], align 8
-// NVPTX-NEXT:    store i64 [[TMP60]], ptr [[__LANE_MASK_ADDR_I_I]], align 8
-// NVPTX-NEXT:    [[TMP61:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_I_I]], align 8
-// NVPTX-NEXT:    [[TMP62:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP61]], i1 true)
-// NVPTX-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], 1
-// NVPTX-NEXT:    [[ISZERO_I_I:%.*]] = icmp eq i64 [[TMP61]], 0
-// NVPTX-NEXT:    [[FFS_I_I:%.*]] = select i1 [[ISZERO_I_I]], i64 0, i64 [[TMP63]]
-// NVPTX-NEXT:    [[CAST_I_I:%.*]] = trunc i64 [[FFS_I_I]] to i32
-// NVPTX-NEXT:    [[SUB_I_I:%.*]] = sub nsw i32 [[CAST_I_I]], 1
-// NVPTX-NEXT:    [[CONV_I_I:%.*]] = sext i32 [[SUB_I_I]] to i64
-// NVPTX-NEXT:    [[CMP_I:%.*]] = icmp eq i64 [[CONV_I44]], [[CONV_I_I]]
-// NVPTX-NEXT:    call void @llvm.nvvm.exit()
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_broadcast_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
 // NVPTX-NEXT:    unreachable
 //
-void foo() {
+__gpu_kernel void foo() {
   __gpu_num_blocks_x();
   __gpu_num_blocks_y();
   __gpu_num_blocks_z();
-  __gpu_num_blocks();
+  __gpu_num_blocks(0);
   __gpu_block_id_x();
   __gpu_block_id_y();
   __gpu_block_id_z();
-  __gpu_block_id();
+  __gpu_block_id(0);
   __gpu_num_threads_x();
   __gpu_num_threads_y();
   __gpu_num_threads_z();
-  __gpu_num_threads();
+  __gpu_num_threads(0);
   __gpu_thread_id_x();
   __gpu_thread_id_y();
   __gpu_thread_id_z();
-  __gpu_thread_id();
+  __gpu_thread_id(0);
   __gpu_num_lanes();
   __gpu_lane_id();
   __gpu_lane_mask();
@@ -499,10 +103,6 @@ void foo() {
   __gpu_sync_lane(-1);
   __gpu_shuffle_idx_u32(-1, -1, -1);
   __gpu_first_lane_id(-1);
-  __gpu_is_first_lane(-1);
+  __gpu_is_first_in_lane(-1);
   __gpu_exit();
 }
-//.
-// AMDGPU: [[META3]] = !{}
-// AMDGPU: [[RNG4]] = !{i16 1, i16 1025}
-//.
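
For readers skimming the updated test, a hedged sketch of what end-user code looks like against the final interface; the kernel name and the output indexing are invented for the example, only the __gpu_* names come from the header.

#include <gpuintrin.h>
#include <stdint.h>

// One representative thread per warp/wavefront records how wide its lane
// group is, indexed by the block id in dimension 0.
__gpu_kernel void record_lane_width(uint32_t *out) {
  uint64_t mask = __gpu_lane_mask();
  if (__gpu_is_first_in_lane(mask))
    out[__gpu_block_id(0)] = __gpu_num_lanes();
  __gpu_sync_lane(mask);
}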
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index f66a9f7e1fd499..c4aae94182bbdc 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,7 +22,7 @@
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=OPENMP
 
-#define _DEFAULT_ATTRS __attribute__((always_inline))
+#define _DEFAULT_FN_ATTRS __attribute__((always_inline))
 #include <gpuintrin.h>
 
 #ifdef __device__
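
The rename above keeps the override hook working: a translation unit may predefine _DEFAULT_FN_ATTRS before the include to control how the helpers are emitted, otherwise gpuintrin.h supplies a default. A short sketch of the two spellings; the always_inline choice mirrors this test, the rest is illustrative.

// Freestanding C compiled directly for the GPU target: force inlining.
#define _DEFAULT_FN_ATTRS __attribute__((always_inline))
#include <gpuintrin.h>

// CUDA/HIP translation units can include the header without the define;
// it then defaults _DEFAULT_FN_ATTRS to __attribute__((device)).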
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index f09f073c5e94bb..6ab95403ca3890 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -136,8 +136,8 @@ LIBC_INLINE uint32_t get_lane_size() {
 
 /// Waits for all the threads in the block to converge and issues a fence.
 [[clang::convergent]] LIBC_INLINE void sync_threads() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
   __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
 /// Waits for all pending memory operations to complete in program order.
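
A hedged sketch of the guarantee callers get from the new ordering, barrier first and then a workgroup-scope acquire fence; the function and variable names here are invented for illustration.

// Every thread reaches the barrier before any thread continues, and the
// fence makes writes performed before the barrier visible afterwards.
void publish_then_read(int *workgroup_buf, int tid, int value, int *out) {
  if (tid == 0)
    workgroup_buf[0] = value;                             // one thread publishes
  __builtin_amdgcn_s_barrier();                           // execution rendezvous
  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");  // visibility
  *out = workgroup_buf[0];                                // all threads observe `value`
}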

>From f90544547e2d3cdffdb64e49acd7ecd704895f82 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 5 Nov 2024 13:38:42 -0600
Subject: [PATCH 08/18] c89 support

---
 clang/lib/Headers/amdgpuintrin.h    | 58 ++++++++++++++---------------
 clang/lib/Headers/gpuintrin.h       | 37 ++++++++++++++----
 clang/lib/Headers/nvptxintrin.h     | 58 ++++++++++++++---------------
 clang/test/Headers/gpuintrin.c      |  1 -
 clang/test/Headers/gpuintrin_lang.c | 15 ++++++++
 5 files changed, 102 insertions(+), 67 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 056d8765340c72..93399d3f4f3328 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -13,15 +13,11 @@
 #error "This file is intended for AMDGPU targets or offloading to AMDGPU"
 #endif
 
-#include <stdbool.h>
 #include <stdint.h>
 
-#if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_FN_ATTRS __attribute__((device))
-#else
-#define _DEFAULT_FN_ATTRS
-#endif
+#if !defined(__cplusplus)
+#pragma push_macro("bool")
+#define bool _Bool
 #endif
 
 #pragma omp begin declare target device_type(nohost)
@@ -38,118 +34,122 @@
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
   return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
   return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension of the current AMD workgroup's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
 // Returns the 'y' dimension of the current AMD workgroup's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
   return __builtin_amdgcn_workgroup_id_y();
 }
 
 // Returns the 'z' dimension of the current AMD workgroup's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
   return __builtin_amdgcn_workgroup_id_z();
 }
 
 // Returns the number of workitems in the 'x' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
   return __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workitems in the 'y' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
   return __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workitems in the 'z' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
   return __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
   return __builtin_amdgcn_workitem_id_x();
 }
 
 // Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
   return __builtin_amdgcn_workitem_id_y();
 }
 
 // Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
   return __builtin_amdgcn_workitem_id_z();
 }
 
 // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
 // and compilation options.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
   return __builtin_amdgcn_wavefrontsize();
 }
 
 // Returns the id of the thread inside of an AMD wavefront executing together.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
 // Returns the bit-mask of active threads in the current wavefront.
-_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
   return __builtin_amdgcn_read_exec();
 }
 
 // Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
-_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask,
-                                                      bool __x) {
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
   // Masking with __lane_mask gives the NVPTX semantics when __lane_mask is a
   // subset of the active threads.
   return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) {
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
   __builtin_amdgcn_s_barrier();
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }
 
 // Waits for all threads in the wavefront to converge; this is a no-op on AMDGPU.
-_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) {
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
   __builtin_amdgcn_wave_barrier();
 }
 
 // Shuffles the lanes inside the wavefront according to the given index.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
 // Terminates execution of the associated wavefront.
-_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) {
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __builtin_amdgcn_endpgm();
 }
 
 #pragma omp end declare variant
 #pragma omp end declare target
 
+#if !defined(__cplusplus)
+#pragma pop_macro("bool")
+#endif
+
 #endif // __AMDGPUINTRIN_H
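
A short, hedged illustration of the masked ballot semantics noted above: counting how many lanes in a given mask vote true. The wrapper name is illustrative; only the __gpu_* call and the builtin come from the headers.

#include <gpuintrin.h>
#include <stdbool.h>
#include <stdint.h>

// Because __gpu_ballot ANDs its result with the provided mask on AMDGPU,
// lanes outside the mask never contribute bits, matching the NVPTX
// vote.ballot.sync behaviour when the mask covers the active lanes.
static inline uint32_t count_votes(uint64_t mask, bool predicate) {
  return (uint32_t)__builtin_popcountll(__gpu_ballot(mask, predicate));
}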
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index f804c489481797..ed804037590637 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -17,6 +17,14 @@
 #ifndef __GPUINTRIN_H
 #define __GPUINTRIN_H
 
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -25,8 +33,15 @@
 #error "This header is only meant to be used on GPU architectures."
 #endif
 
+#if !defined(__cplusplus)
+#pragma push_macro("bool")
+#define bool _Bool
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+
 // Returns the number of blocks in the requested dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
   switch (__dim) {
   case 0:
     return __gpu_num_blocks_x();
@@ -40,7 +55,7 @@ _DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
 }
 
 // Returns the block id in the requested dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
   switch (__dim) {
   case 0:
     return __gpu_block_id_x();
@@ -54,7 +69,7 @@ _DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
 }
 
 // Returns the number of threads in the requested dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
   switch (__dim) {
   case 0:
     return __gpu_num_threads_x();
@@ -68,7 +83,7 @@ _DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
 }
 
 // Returns the thread id in the requested dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
   switch (__dim) {
   case 0:
     return __gpu_thread_id_x();
@@ -82,19 +97,19 @@ _DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
 }
 
 // Get the first active thread inside the lane.
-_DEFAULT_FN_ATTRS static inline uint64_t
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_first_lane_id(uint64_t __lane_mask) {
   return __builtin_ffsll(__lane_mask) - 1;
 }
 
 // Conditional that is only true for a single thread in a lane.
-_DEFAULT_FN_ATTRS static inline bool
+_DEFAULT_FN_ATTRS static __inline__ bool
 __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
     uint32_t index = step + __gpu_lane_id();
@@ -104,7 +119,7 @@ __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
 }
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
     uint32_t index = __gpu_lane_id() - step;
@@ -114,6 +129,12 @@ __gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
   return x;
 }
 
+#pragma omp end declare target
+
+#if !defined(__cplusplus)
+#pragma pop_macro("bool")
+#endif
+
 #undef _DEFAULT_FN_ATTRS
 
 #endif // __GPUINTRIN_H
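
A hedged usage sketch for the reduction helper above, summing one value per lane; the kernel name is illustrative and the expected total in the comment assumes a fully active 32-lane warp.

#include <gpuintrin.h>
#include <stdint.h>

__gpu_kernel void sum_lane_ids(uint32_t *out) {
  uint64_t mask = __gpu_lane_mask();
  // Butterfly-reduce across the warp/wavefront: with 32 active lanes the
  // ids 0..31 sum to 496.
  uint32_t total = __gpu_lane_reduce_u32(mask, __gpu_lane_id());
  if (__gpu_is_first_in_lane(mask))
    *out = total;
}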
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 0fee0faa4c9680..a205a652dcb786 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,15 +13,11 @@
 #error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
-#include <stdbool.h>
 #include <stdint.h>
 
-#if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_FN_ATTRS __attribute__((device))
-#else
-#define _DEFAULT_FN_ATTRS
-#endif
+#if !defined(__cplusplus)
+#pragma push_macro("bool")
+#define bool _Bool
 #endif
 
 #pragma omp begin declare target device_type(nohost)
@@ -38,82 +34,82 @@
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 
 // Returns the number of CUDA blocks in the 'y' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
   return __nvvm_read_ptx_sreg_nctaid_y();
 }
 
 // Returns the number of CUDA blocks in the 'z' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
   return __nvvm_read_ptx_sreg_nctaid_z();
 }
 
 // Returns the 'x' dimension of the current CUDA block's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
   return __nvvm_read_ptx_sreg_ctaid_x();
 }
 
 // Returns the 'y' dimension of the current CUDA block's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
   return __nvvm_read_ptx_sreg_ctaid_y();
 }
 
 // Returns the 'z' dimension of the current CUDA block's id.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
   return __nvvm_read_ptx_sreg_ctaid_z();
 }
 
 // Returns the number of CUDA threads in the 'x' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
   return __nvvm_read_ptx_sreg_ntid_x();
 }
 
 // Returns the number of CUDA threads in the 'y' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
   return __nvvm_read_ptx_sreg_ntid_y();
 }
 
 // Returns the number of CUDA threads in the 'z' dimension.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
   return __nvvm_read_ptx_sreg_ntid_z();
 }
 
 // Returns the 'x' dimension id of the thread in the current CUDA block.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
   return __nvvm_read_ptx_sreg_tid_x();
 }
 
 // Returns the 'y' dimension id of the thread in the current CUDA block.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
   return __nvvm_read_ptx_sreg_tid_y();
 }
 
 // Returns the 'z' dimension id of the thread in the current CUDA block.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
   return __nvvm_read_ptx_sreg_tid_z();
 }
 
 // Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
   return __nvvm_read_ptx_sreg_warpsize();
 }
 
 // Returns the id of the thread inside of a CUDA warp executing together.
-_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
   return __nvvm_read_ptx_sreg_laneid();
 }
 
 // Returns the bit-mask of active threads in the current warp.
-_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
   return __nvvm_activemask();
 }
 
 // Copies the value from the first active thread in the warp to the rest.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __id = __builtin_ffs(__mask) - 1;
@@ -121,24 +117,24 @@ __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
-_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask,
-                                                      bool __x) {
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   return __nvvm_vote_ballot_sync(__mask, __x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) {
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
   __syncthreads();
 }
 
 // Waits for all threads in the warp to reconverge for independent scheduling.
-_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) {
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
   __nvvm_bar_warp_sync((uint32_t)__lane_mask);
 }
 
 // Shuffles the lanes inside the warp according to the given index.
-_DEFAULT_FN_ATTRS static inline uint32_t
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __bitmask = (__mask >> __idx) & 1u;
@@ -147,11 +143,15 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
 }
 
 // Terminates execution of the calling thread.
-_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) {
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __nvvm_exit();
 }
 
 #pragma omp end declare variant
 #pragma omp end declare target
 
+#if !defined(__cplusplus)
+#pragma pop_macro("bool")
+#endif
+
 #endif // __NVPTXINTRIN_H
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 25e89b0e059e29..0645cd51b67398 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -10,7 +10,6 @@
 // RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=NVPTX
 
-#define _DEFAULT_ATTRS __attribute__((always_inline))
 #include <gpuintrin.h>
 
 // AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index c4aae94182bbdc..c1040de5a58ec8 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -21,6 +21,11 @@
 // RUN:   -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=OPENMP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=C89
 
 #define _DEFAULT_FN_ATTRS __attribute__((always_inline))
 #include <gpuintrin.h>
@@ -56,6 +61,16 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 // OPENMP-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
 // OPENMP-NEXT:    ret i32 [[TMP0]]
 //
+// C89-LABEL: define dso_local i32 @foo(
+// C89-SAME: ) #[[ATTR0:[0-9]+]] {
+// C89-NEXT:  [[ENTRY:.*:]]
+// C89-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// C89-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// C89-NEXT:    [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// C89-NEXT:    ret i32 [[TMP0]]
+//
 int foo() { return __gpu_thread_id_x(); }
 #pragma omp declare target to(foo)
 #endif

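As an aside, a minimal sketch (not part of the patch, with a made-up __example_* name) of the C89-compatibility pattern the headers now rely on: '__inline__' stands in for 'inline', and 'bool' is temporarily mapped to '_Bool' and then restored so user code that defines its own 'bool' is unaffected.

  #if !defined(__cplusplus)
  #pragma push_macro("bool")
  #define bool _Bool
  #endif

  /* Hypothetical example function using the guarded 'bool'. */
  static __inline__ bool __example_is_even(unsigned __x) { return (__x & 1u) == 0u; }

  #if !defined(__cplusplus)
  #pragma pop_macro("bool")
  #endif
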
>From 1e41cf4264f4bf62dd235d3fc92b4637ac50ab98 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 5 Nov 2024 13:43:55 -0600
Subject: [PATCH 09/18] Use _Pragma

---
 clang/lib/Headers/amdgpuintrin.h | 12 ++++++------
 clang/lib/Headers/gpuintrin.h    |  8 ++++----
 clang/lib/Headers/nvptxintrin.h  | 12 ++++++------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 93399d3f4f3328..91eb7973d67e51 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -16,12 +16,12 @@
 #include <stdint.h>
 
 #if !defined(__cplusplus)
-#pragma push_macro("bool")
+_Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
-#pragma omp begin declare target device_type(nohost)
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 
 // Type aliases to the address spaces used by the AMDGPU backend.
 #define __gpu_private __attribute__((opencl_private))
@@ -145,11 +145,11 @@ _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __builtin_amdgcn_endpgm();
 }
 
-#pragma omp end declare variant
-#pragma omp end declare target
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
 
 #if !defined(__cplusplus)
-#pragma pop_macro("bool")
+_Pragma("pop_macro(\"bool\")");
 #endif
 
 #endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ed804037590637..61a68bec239be3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -34,11 +34,11 @@
 #endif
 
 #if !defined(__cplusplus)
-#pragma push_macro("bool")
+_Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
-#pragma omp begin declare target device_type(nohost)
+_Pragma("omp begin declare target device_type(nohost)");
 
 // Returns the number of blocks in the requested dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
@@ -129,10 +129,10 @@ __gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
   return x;
 }
 
-#pragma omp end declare target
+_Pragma("omp end declare target");
 
 #if !defined(__cplusplus)
-#pragma pop_macro("bool")
+_Pragma("pop_macro(\"bool\")");
 #endif
 
 #undef _DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index a205a652dcb786..62bb4983cbf3dc 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -16,12 +16,12 @@
 #include <stdint.h>
 
 #if !defined(__cplusplus)
-#pragma push_macro("bool")
+_Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
-#pragma omp begin declare target device_type(nohost)
-#pragma omp begin declare variant match(device = {arch(nvptx64)})
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 
 // Type aliases to the address spaces used by the NVPTX backend.
 #define __gpu_private __attribute__((opencl_private))
@@ -147,11 +147,11 @@ _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __nvvm_exit();
 }
 
-#pragma omp end declare variant
-#pragma omp end declare target
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
 
 #if !defined(__cplusplus)
-#pragma pop_macro("bool")
+_Pragma("pop_macro(\"bool\")");
 #endif
 
 #endif // __NVPTXINTRIN_H

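For reference, _Pragma is the operator form of #pragma: it takes a string literal (so quotes inside it must be escaped) and, unlike the directive form, can be produced by macro expansion. A small illustrative sketch, using hypothetical __EXAMPLE_* macro names that are not part of the patch:

  /* Illustrative only; without -fopenmp the pragmas are simply ignored. */
  #define __EXAMPLE_BEGIN_TARGET _Pragma("omp begin declare target device_type(nohost)")
  #define __EXAMPLE_END_TARGET _Pragma("omp end declare target")

  __EXAMPLE_BEGIN_TARGET
  static __inline__ unsigned __example_identity(unsigned __x) { return __x; }
  __EXAMPLE_END_TARGET
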
>From dadaca2f19c6526a24dabdb62716341dff0abc60 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 5 Nov 2024 13:47:00 -0600
Subject: [PATCH 10/18] broadcast to read_first_lane

---
 clang/lib/Headers/amdgpuintrin.h | 2 +-
 clang/lib/Headers/gpuintrin.h    | 2 +-
 clang/lib/Headers/nvptxintrin.h  | 2 +-
 clang/test/Headers/gpuintrin.c   | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 91eb7973d67e51..ee9410b1a9a483 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -111,7 +111,7 @@ _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
 
 // Copies the value from the first active thread in the wavefront to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 61a68bec239be3..d0fde284f9faf0 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,7 +115,7 @@ __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
     uint32_t index = step + __gpu_lane_id();
     x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
   }
-  return __gpu_broadcast_u32(__lane_mask, x);
+  return __gpu_read_first_lane_u32(__lane_mask, x);
 }
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 62bb4983cbf3dc..844d1554a63c44 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -110,7 +110,7 @@ _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
 
 // Copies the value from the first active thread in the warp to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __id = __builtin_ffs(__mask) - 1;
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 0645cd51b67398..2e45f73692f534 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -34,7 +34,7 @@
 // AMDGPU-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_broadcast_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
@@ -66,7 +66,7 @@
 // NVPTX-NEXT:    [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_broadcast_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
@@ -96,7 +96,7 @@ __gpu_kernel void foo() {
   __gpu_num_lanes();
   __gpu_lane_id();
   __gpu_lane_mask();
-  __gpu_broadcast_u32(-1, -1);
+  __gpu_read_first_lane_u32(-1, -1);
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);

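A short usage sketch of the renamed function (hypothetical helper, assuming gpuintrin.h is on the include path and the lanes named in the mask are converged): every lane receives the value held by the first active lane, which is the usual way to make a per-warp or per-wavefront result uniform.

  #include <gpuintrin.h>

  /* Hypothetical helper: broadcast the first active lane's value to the rest. */
  static __inline__ uint32_t __example_uniform_u32(uint32_t __per_lane) {
    return __gpu_read_first_lane_u32(__gpu_lane_mask(), __per_lane);
  }
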
>From b942931ff841460198bf6ba14cab218539f98e5b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 5 Nov 2024 15:39:42 -0600
Subject: [PATCH 11/18] Stdint

---
 clang/lib/Headers/gpuintrin.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index d0fde284f9faf0..cccd413ac6147e 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -33,6 +33,8 @@
 #error "This header is only meant to be used on GPU architectures."
 #endif
 
+#include <stdint.h>
+
 #if !defined(__cplusplus)
 _Pragma("push_macro(\"bool\")");
 #define bool _Bool

>From 1703f0104c8f2770c27ce1913365e71309b9ff15 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 7 Nov 2024 10:23:31 -0600
Subject: [PATCH 12/18] Add gpu variant for OpenMP

---
 clang/lib/Headers/gpuintrin.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index cccd413ac6147e..6d32d5c15577f2 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -41,6 +41,7 @@ _Pragma("push_macro(\"bool\")");
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
 
 // Returns the number of blocks in the requested dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
@@ -131,6 +132,7 @@ __gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
   return x;
 }
 
+_Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
 #if !defined(__cplusplus)

>From f727e6a89107ff77797dcd46a2f2dfc6e93d3cd5 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 7 Nov 2024 12:20:04 -0600
Subject: [PATCH 13/18] Support for u64

---
 clang/lib/Headers/amdgpuintrin.h | 18 ++++++++++++++++++
 clang/lib/Headers/nvptxintrin.h  | 28 ++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index ee9410b1a9a483..49851dfddadef1 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -115,6 +115,15 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+}
+
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                           bool __x) {
@@ -140,6 +149,15 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+}
+
 // Terminates execution of the associated wavefront.
 _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __builtin_amdgcn_endpgm();
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 844d1554a63c44..b9a1c862c228b3 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -116,6 +116,20 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
 }
 
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __id = __builtin_ffs(__mask) - 1;
+  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
+                                             __gpu_num_lanes() - 1)
+          << 32ull) |
+         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
+                                             __gpu_num_lanes() - 1));
+}
+
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                           bool __x) {
@@ -142,6 +156,20 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
          __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
 }
 
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint64_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask & (((uint64_t)__nvvm_shfl_sync_idx_i32(
+                            __mask, __hi, __idx, __gpu_num_lanes() - 1u)
+                        << 32ull) |
+                       ((uint64_t)__nvvm_shfl_sync_idx_i32(
+                           __mask, __lo, __idx, __gpu_num_lanes() - 1u)));
+}
+
 // Terminates execution of the calling thread.
 _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __nvvm_exit();

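The new u64 variants split the value into 32-bit halves, run the existing 32-bit primitive on each half, and recombine the results, since the underlying target builtins operate on 32-bit registers. A tiny host-side sketch (illustrative only) of that split/recombine arithmetic:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    uint64_t x = 0x1122334455667788ull;
    uint32_t hi = (uint32_t)(x >> 32);        /* handled as one 32-bit op */
    uint32_t lo = (uint32_t)(x & 0xFFFFFFFF); /* handled as another       */
    assert((((uint64_t)hi << 32) | (uint64_t)lo) == x);
    return 0;
  }
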
>From ffc10e8fe5a7d686b065c378c78bf921b6c56972 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 7 Nov 2024 15:16:25 -0600
Subject: [PATCH 14/18] Add floating point versions

---
 clang/lib/Headers/gpuintrin.h | 58 ++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 6d32d5c15577f2..4f95e7982fcfdb 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -111,26 +111,54 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, float __x) {
+  return __builtin_bit_cast(
+      float,
+      __gpu_shuffle_idx_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, double __x) {
+  return __builtin_bit_cast(
+      double,
+      __gpu_shuffle_idx_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
+}
+
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
-    uint32_t index = step + __gpu_lane_id();
-    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_REDUCE(__type, __suffix)                                     \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix(     \
+      uint64_t __lane_mask, __type x) {                                        \
+    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {         \
+      uint32_t index = step + __gpu_lane_id();                                 \
+      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);                \
+    }                                                                          \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, x);                   \
   }
-  return __gpu_read_first_lane_u32(__lane_mask, x);
-}
+__DO_LANE_REDUCE(uint32_t, u32);
+__DO_LANE_REDUCE(uint64_t, u64);
+__DO_LANE_REDUCE(float, f32);
+__DO_LANE_REDUCE(double, f64);
+#undef __DO_LANE_REDUCE
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
-    uint32_t index = __gpu_lane_id() - step;
-    uint32_t bitmask = __gpu_lane_id() >= step;
-    x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
+  _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
+      uint64_t __lane_mask, uint32_t x) {                                      \
+    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
+      uint32_t index = __gpu_lane_id() - step;                                 \
+      __bitmask_type bitmask = __gpu_lane_id() >= step;                        \
+      x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);     \
+    }                                                                          \
+    return x;                                                                  \
   }
-  return x;
-}
+__DO_LANE_SCAN(uint32_t, uint32_t, u32);
+__DO_LANE_SCAN(uint64_t, uint64_t, u64);
+__DO_LANE_SCAN(float, uint32_t, f32);
+__DO_LANE_SCAN(double, uint64_t, f64);
+#undef __DO_LANE_SCAN
 
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");

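For readability, this is roughly what __DO_LANE_REDUCE(uint32_t, u32) above expands to (whitespace adjusted, assuming _DEFAULT_FN_ATTRS and the u32 helpers declared earlier in the header): a log2(#lanes) shuffle loop followed by a broadcast from the first active lane.

  _DEFAULT_FN_ATTRS static __inline__ uint32_t
  __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
      uint32_t index = step + __gpu_lane_id();
      x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
    }
    return __gpu_read_first_lane_u32(__lane_mask, x);
  }
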
>From f9f0e839693b663da0b9a5b8a6a5fd10a663cdc8 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 7 Nov 2024 16:29:52 -0600
Subject: [PATCH 15/18] Fix floating point scans

---
 clang/lib/Headers/gpuintrin.h | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4f95e7982fcfdb..1911de81c6f109 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -113,18 +113,34 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
 
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, float __x) {
+__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
   return __builtin_bit_cast(
-      float,
-      __gpu_shuffle_idx_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
+      float, __gpu_read_first_lane_u32(__lane_mask,
+                                       __builtin_bit_cast(uint32_t, __x)));
 }
 
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, double __x) {
+__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
   return __builtin_bit_cast(
-      double,
-      __gpu_shuffle_idx_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
+      double, __gpu_read_first_lane_u64(__lane_mask,
+                                        __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
@@ -150,7 +166,10 @@ __DO_LANE_REDUCE(double, f64);
     for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
       uint32_t index = __gpu_lane_id() - step;                                 \
       __bitmask_type bitmask = __gpu_lane_id() >= step;                        \
-      x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);     \
+      x += __builtin_bit_cast(                                                 \
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
+                                                __gpu_shuffle_idx_##__suffix(  \
+                                                    __lane_mask, index, x)));  \
     }                                                                          \
     return x;                                                                  \
   }

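The fix applies the lane-activity mask to the raw bit pattern, since '-bitmask &' cannot be applied to a float directly. A host-side sketch of the trick (illustrative only, using memcpy in place of __builtin_bit_cast): an all-ones mask keeps the addend, an all-zeros mask turns it into 0.0.

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  static float masked_addend(float value, uint32_t keep) { /* keep is 0 or 1 */
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    bits &= -keep; /* all-ones when keep == 1, zero when keep == 0 */
    float out;
    memcpy(&out, &bits, sizeof(out));
    return out;
  }

  int main(void) {
    assert(masked_addend(3.5f, 1) == 3.5f);
    assert(masked_addend(3.5f, 0) == 0.0f);
    return 0;
  }
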
>From 860b550ef28c7d4160278b8e77e7f29240534bf6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 7 Nov 2024 21:31:00 -0600
Subject: [PATCH 16/18] comments

---
 clang/lib/Headers/gpuintrin.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 1911de81c6f109..e1351b10a78188 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -153,10 +153,10 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
     }                                                                          \
     return __gpu_read_first_lane_##__suffix(__lane_mask, x);                   \
   }
-__DO_LANE_REDUCE(uint32_t, u32);
-__DO_LANE_REDUCE(uint64_t, u64);
-__DO_LANE_REDUCE(float, f32);
-__DO_LANE_REDUCE(double, f64);
+__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
+__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
+__DO_LANE_REDUCE(float, f32);    // float __gpu_lane_reduce_f32(m, x)
+__DO_LANE_REDUCE(double, f64);   // double __gpu_lane_reduce_f64(m, x)
 #undef __DO_LANE_REDUCE
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
@@ -173,10 +173,10 @@ __DO_LANE_REDUCE(double, f64);
     }                                                                          \
     return x;                                                                  \
   }
-__DO_LANE_SCAN(uint32_t, uint32_t, u32);
-__DO_LANE_SCAN(uint64_t, uint64_t, u64);
-__DO_LANE_SCAN(float, uint32_t, f32);
-__DO_LANE_SCAN(double, uint64_t, f64);
+__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
+__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
+__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
+__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
 _Pragma("omp end declare variant");

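Since the instantiation comments only name the generated functions, a hand-expanded sketch of __DO_LANE_SCAN(uint32_t, uint32_t, u32) may also help (whitespace adjusted, with the identity bit_casts of the unsigned case dropped): an inclusive prefix sum where each step adds the value shuffled from 'step' lanes below, masked out for lanes that have no such neighbour.

  _DEFAULT_FN_ATTRS static __inline__ uint32_t
  __gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
      uint32_t index = __gpu_lane_id() - step;
      uint32_t bitmask = __gpu_lane_id() >= step; /* 0 or 1 */
      x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
    }
    return x;
  }
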
>From 238673380983aa04e490fbd425604743c8e5a882 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 8 Nov 2024 20:14:37 -0600
Subject: [PATCH 17/18] comments

---
 clang/lib/Headers/gpuintrin.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index e1351b10a78188..1f72654b508fd6 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -43,6 +43,10 @@ _Pragma("push_macro(\"bool\")");
 _Pragma("omp begin declare target device_type(nohost)");
 _Pragma("omp begin declare variant match(device = {kind(gpu)})");
 
+#define __GPU_X_DIM 0
+#define __GPU_Y_DIM 1
+#define __GPU_Z_DIM 2
+
 // Returns the number of blocks in the requested dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
   switch (__dim) {
@@ -127,7 +131,7 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
                                         __builtin_bit_cast(uint64_t, __x)));
 }
 
-// Gets the first floating point value from the active lanes.
+// Shuffles the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
   return __builtin_bit_cast(
@@ -135,7 +139,7 @@ __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
                                    __builtin_bit_cast(uint32_t, __x)));
 }
 
-// Gets the first floating point value from the active lanes.
+// Shuffles the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ double
 __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
   return __builtin_bit_cast(

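A small usage sketch for the new dimension macros (hypothetical helper; only functions already defined in the header are used): querying the total number of threads in the current block.

  #include <gpuintrin.h>

  /* Hypothetical helper: total threads in the current block. */
  static __inline__ uint32_t __example_block_size(void) {
    return __gpu_num_threads(__GPU_X_DIM) * __gpu_num_threads(__GPU_Y_DIM) *
           __gpu_num_threads(__GPU_Z_DIM);
  }
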
>From 35f20bbe5ce45194dff68c52018cb3cf04b533f7 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 11 Nov 2024 08:04:56 -0600
Subject: [PATCH 18/18] Add more functions

---
 clang/lib/Headers/amdgpuintrin.h | 20 ++++++++
 clang/lib/Headers/gpuintrin.h    | 79 +++++++++++++++++++++++---------
 clang/lib/Headers/nvptxintrin.h  | 19 ++++++++
 3 files changed, 97 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 49851dfddadef1..9166b035776943 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,9 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
+// A pointer to the dynamic local memory buffer.
+extern uint8_t __gpu_local __gpu_dynamic_buffer[];
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -158,11 +161,28 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
          ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
 }
 
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __builtin_amdgcn_is_shared(
+      (void __attribute__((address_space(0))) *)ptr);
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __builtin_amdgcn_is_private(
+      (void __attribute__((address_space(0))) *)ptr);
+}
+
 // Terminates execution of the associated wavefront.
 _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __builtin_amdgcn_endpgm();
 }
 
+// Suspends the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  __builtin_amdgcn_s_sleep(2);
+}
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 1f72654b508fd6..183685dbc5e453 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,22 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Gets the first integer value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ int32_t
+__gpu_read_first_lane_i32(uint64_t __lane_mask, int32_t __x) {
+  return __builtin_bit_cast(
+      int32_t, __gpu_read_first_lane_u32(__lane_mask,
+                                         __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first integer value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ int64_t
+__gpu_read_first_lane_i64(uint64_t __lane_mask, int64_t __x) {
+  return __builtin_bit_cast(
+      int64_t, __gpu_read_first_lane_u64(__lane_mask,
+                                         __builtin_bit_cast(uint64_t, __x)));
+}
+
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
@@ -131,6 +147,22 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
                                         __builtin_bit_cast(uint64_t, __x)));
 }
 
+// Shuffles the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ int32_t
+__gpu_shuffle_idx_i32(uint64_t __lane_mask, uint32_t __idx, int32_t __x) {
+  return __builtin_bit_cast(
+      int32_t, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                     __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ int64_t
+__gpu_shuffle_idx_i64(uint64_t __lane_mask, uint32_t __idx, int64_t __x) {
+  return __builtin_bit_cast(
+      int64_t, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                     __builtin_bit_cast(uint64_t, __x)));
+}
+
 // Shuffles the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
@@ -148,37 +180,42 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_REDUCE(__type, __suffix)                                     \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix(     \
-      uint64_t __lane_mask, __type x) {                                        \
-    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {         \
-      uint32_t index = step + __gpu_lane_id();                                 \
-      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);                \
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
+      uint32_t __index = __step + __gpu_lane_id();                             \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
     }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, x);                   \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
   }
-__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
-__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
-__DO_LANE_REDUCE(float, f32);    // float __gpu_lane_reduce_f32(m, x)
-__DO_LANE_REDUCE(double, f64);   // double __gpu_lane_reduce_f64(m, x)
-#undef __DO_LANE_REDUCE
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(int32_t, i32);  // int32_t __gpu_lane_sum_i32(m, x)
+__DO_LANE_SUM(int64_t, i64);  // int64_t __gpu_lane_sum_i64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
-      uint64_t __lane_mask, uint32_t x) {                                      \
-    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
-      uint32_t index = __gpu_lane_id() - step;                                 \
-      __bitmask_type bitmask = __gpu_lane_id() >= step;                        \
-      x += __builtin_bit_cast(                                                 \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, index, x)));  \
+      uint64_t __lane_mask, uint32_t __x) {                                    \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __index = __gpu_lane_id() - __step;                             \
+      __bitmask_type __bitmask = __gpu_lane_id() >= __step;                    \
+      __x += __builtin_bit_cast(                                               \
+          __type,                                                              \
+          -__bitmask & __builtin_bit_cast(__bitmask_type,                      \
+                                        __gpu_shuffle_idx_##__suffix(          \
+                                            __lane_mask, __index, __x)));      \
     }                                                                          \
-    return x;                                                                  \
+    return __x;                                                                \
   }
 __DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
 __DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
+__DO_LANE_SCAN(int32_t, uint32_t, i32);  // int32_t __gpu_lane_scan_i32(m, x)
+__DO_LANE_SCAN(int64_t, uint64_t, i64);  // int64_t __gpu_lane_scan_i64(m, x)
 __DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index b9a1c862c228b3..e66c13bb1cee82 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -33,6 +33,9 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
+// A pointer to the dynamic shared memory buffer.
+extern uint8_t __gpu_local __gpu_dynamic_shared_buffer[];
+
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -170,11 +173,27 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
                                              __gpu_num_lanes() - 1u));
 }
 
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __nvvm_isspacep_shared(ptr);
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __nvvm_isspacep_local(ptr);
+}
+
 // Terminates execution of the calling thread.
 _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
   __nvvm_exit();
 }
 
+// Suspends the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+    __asm__ volatile("nanosleep.u32 64;" ::: "memory");
+}
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 

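Finally, a hedged usage sketch tying the new helpers together (hypothetical function, not part of the patch; assumes the flag is visible to every thread that polls it): spin on a flag, yielding between polls with __gpu_thread_suspend, and bail out early if the pointer is not in block-local ('shared') memory.

  #include <gpuintrin.h>

  /* Hypothetical helper: busy-wait on a flag in block-local memory. */
  static __inline__ void __example_wait_for_flag(unsigned volatile *__flag) {
    if (!__gpu_is_ptr_local((void *)__flag))
      return;
    while (!*__flag)
      __gpu_thread_suspend();
  }
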


More information about the cfe-commits mailing list