[clang] [llvm] [SPIRV] GPU intrinsics (PR #131190)

Jon Chesterfield via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 13 11:52:14 PDT 2025


https://github.com/JonChesterfield created https://github.com/llvm/llvm-project/pull/131190

Introduce __builtin_gpu builtins in clang and corresponding llvm.gpu intrinsics in llvm that abstract over minor differences between GPU architectures, and use them to implement a gpuintrin.h instantiation that supports compiling llvm libc to spirv64--.
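
As a rough sketch of the mapping (illustrative code, not part of the patch; the names come from the builtins and tests below), a builtin call lowers directly to the matching llvm.gpu intrinsic:

  #include <stdint.h>

  // Compiles for any GPU target, e.g. -triple spirv64 as in the new tests.
  uint32_t tid_x(void) {
    return __builtin_gpu_thread_id_x(); // lowers to a call to @llvm.gpu.thread.id.x(), see gpu_builtins.c below
  }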

Motivated by the discussion at https://discourse.llvm.org/t/rfc-spirv-ir-as-a-vendor-agnostic-gpu-representation/85115; an RFC for this change specifically is <tbd>.

These are not named llvm.spirv because there are no SPIRV-specific semantics involved. They are deliberately the same small abstraction over targets that llvm libc already uses. Essentially, this patch allows us to postpone choosing a target architecture for libc until JIT time.
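
A minimal sketch of what that looks like from the libc side (illustrative, assuming the gpuintrin.h wrappers in this patch): the same translation unit builds for amdgcn, nvptx64 or spirv64, and only the spirv64 build keeps the llvm.gpu calls around to be resolved later:

  #include <gpuintrin.h>
  #include <stdint.h>

  // gpuintrin.h dispatches to amdgpuintrin.h, nvptxintrin.h or the new
  // spirvintrin.h depending on the target, so this source stays target-agnostic.
  uint32_t flat_id(void) {
    return __gpu_num_threads_x() * __gpu_block_id_x() + __gpu_thread_id_x();
  }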

There is some refactoring to do if this lands: moving some work out of CGBuiltin.cpp, simplifying gpuintrin.h, and adjusting OpenMP's codegen to use these instead of the devicertl magic functions.


>From c6eb8e105a711d8433ade0441d37a10d729c70f9 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 12 Mar 2025 20:55:17 +0000
Subject: [PATCH] [SPIRV] GPU intrinsics

---
 clang/include/clang/Basic/Builtins.td         |  33 +
 clang/lib/Headers/amdgpuintrin.h              |   2 +-
 clang/lib/Headers/gpuintrin.h                 |   2 +
 clang/lib/Headers/spirvintrin.h               | 177 +++++
 clang/test/CodeGen/amdgpu-grid-builtins.c     | 158 +++++
 clang/test/CodeGen/gpu_builtins.c             | 647 ++++++++++++++++++
 clang/test/Headers/gpuintrin.c                | 223 ++++++
 llvm/include/llvm/IR/Intrinsics.td            |  63 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   2 +
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/include/llvm/Transforms/Scalar.h         |   6 +
 .../Transforms/Scalar/LowerGPUIntrinsic.h     |  26 +
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |   1 +
 .../Transforms/Scalar/LowerGPUIntrinsic.cpp   | 501 ++++++++++++++
 llvm/lib/Transforms/Scalar/Scalar.cpp         |   1 +
 llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll     | 427 ++++++++++++
 18 files changed, 2271 insertions(+), 1 deletion(-)
 create mode 100644 clang/lib/Headers/spirvintrin.h
 create mode 100644 clang/test/CodeGen/amdgpu-grid-builtins.c
 create mode 100644 clang/test/CodeGen/gpu_builtins.c
 create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
 create mode 100644 llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
 create mode 100644 llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 2fbdfaea57ccd..fb87b5c74c0f8 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4770,6 +4770,39 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
   let Prototype = "char const*(...)";
 }
 
+// GPU intrinsics
+class GPUBuiltin<string prototype> : Builtin {
+  let Spellings = ["__builtin_" # NAME];
+  let Prototype = prototype;
+  let Attributes = [NoThrow];
+}
+
+multiclass GPUGridBuiltin<string prototype> {
+  def _x : GPUBuiltin<prototype>;
+  def _y : GPUBuiltin<prototype>;
+  def _z : GPUBuiltin<prototype>;
+}
+
+defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">;
+defm gpu_block_id : GPUGridBuiltin<"uint32_t()">;
+defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">;
+defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">;
+
+def gpu_num_lanes : GPUBuiltin<"uint32_t()">;
+def gpu_lane_id : GPUBuiltin<"uint32_t()">;
+def gpu_lane_mask : GPUBuiltin<"uint64_t()">;
+
+def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">;
+def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">;
+
+def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">;
+
+// TODO: why can't these be marked convergent?
+def gpu_sync_threads : GPUBuiltin<"void()">;
+def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">;
+def gpu_exit : GPUBuiltin<"void()">;
+def gpu_thread_suspend : GPUBuiltin<"void()">;
+
 // HLSL
 def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_adduint64"];
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..817cfeec896c4 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -1,4 +1,4 @@
-//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 0fb3916acac61..a3ce535188a48 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#elif defined(__SPIRV64__)
+#include <spirvintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..a5129d3577edf
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,177 @@
+//===-- spirvintrin.h - SPIRV intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV64__
+// 32-bit SPIRV is currently a stretch goal
+#error "This file is intended for SPIRV64 targets or offloading to SPIRV64"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+// This is the skeleton of the SPIRV implementation of gpuintrin.h.
+// Address spaces and the kernel attribute are not yet implemented.
+// The target-specific functions are declarations waiting for clang support.
+
+#if defined(_OPENMP)
+#error "Openmp is not yet available on spirv though gpuintrin header"
+#endif
+
+// Type aliases to the address spaces used by the SPIRV backend.
+#define __gpu_private
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global
+#define __gpu_generic
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_gpu_num_blocks_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_gpu_num_blocks_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_gpu_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_gpu_block_id_x();
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_gpu_block_id_y();
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_gpu_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_gpu_num_threads_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_gpu_num_threads_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_gpu_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_gpu_thread_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_gpu_thread_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_gpu_thread_id_z();
+}
+
+// Returns the size of the wave.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_gpu_num_lanes();
+}
+
+// Returns the id of the thread inside of a wave executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_gpu_lane_id();
+}
+
+// Returns the bit-mask of active threads in the current wave.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __builtin_gpu_lane_mask();
+}
+
+// Copies the value from the first active thread in the wave to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_gpu_read_first_lane_u32(__lane_mask, __x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  return __builtin_gpu_ballot(__lane_mask, __x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  return __builtin_gpu_sync_threads();
+}
+
+// Wait for all threads in the wave to converge
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  return __builtin_gpu_sync_lane(__lane_mask);
+}
+
+// Shuffles the lanes inside the wave according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Terminates execution of the associated wave.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  return __builtin_gpu_exit();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  return __builtin_gpu_thread_suspend();
+}
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/CodeGen/amdgpu-grid-builtins.c b/clang/test/CodeGen/amdgpu-grid-builtins.c
new file mode 100644
index 0000000000000..2104da2dc3cbc
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-grid-builtins.c
@@ -0,0 +1,158 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s
+
+#include <stdint.h>
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_x(void)
+{
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_y(void)
+{
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_z(void)
+{
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workitem_id_x(void)
+{
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workitem_id_y(void)
+{
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t workitem_id_z(void)
+{
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]]
+// CHECK-NEXT:    [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[CONV]]
+//
+uint32_t workgroup_size_x(void)
+{
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// CHECK-NEXT:    [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[CONV]]
+//
+uint32_t workgroup_size_y(void)
+{
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// CHECK-NEXT:    [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[CONV]]
+//
+uint32_t workgroup_size_z(void)
+{
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]]
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t grid_size_x(void)
+{
+  return __builtin_amdgcn_grid_size_x();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t grid_size_y(void)
+{
+  return __builtin_amdgcn_grid_size_y();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t grid_size_z(void)
+{
+  return __builtin_amdgcn_grid_size_z();
+}
+
+//.
+// CHECK: [[RNG3]] = !{i16 1, i16 1025}
+// CHECK: [[META4]] = !{}
+// CHECK: [[RNG5]] = !{i32 1, i32 0}
+//.
diff --git a/clang/test/CodeGen/gpu_builtins.c b/clang/test/CodeGen/gpu_builtins.c
new file mode 100644
index 0000000000000..8231b0952fa5e
--- /dev/null
+++ b/clang/test/CodeGen/gpu_builtins.c
@@ -0,0 +1,647 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
+// RUN: %clang_cc1 -O1 -triple spirv64 %s -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV64
+// RUN: %clang_cc1 -O1 -triple spirv64-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix=AMDHSA
+// RUN: %clang_cc1 -O1 -triple nvptx64 -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX
+// RUN: %clang_cc1 -O1 -triple amdgcn -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN
+
+#include <stdint.h>
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_x(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_x(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.x()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_x(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_x(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_blocks_x(void) {
+  return __builtin_gpu_num_blocks_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_y(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_y(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.y()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_y(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_y(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_blocks_y(void) {
+  return __builtin_gpu_num_blocks_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_z(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_z(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.z()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_z(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_z(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_blocks_z(void) {
+  return __builtin_gpu_num_blocks_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_x(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_x(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.x()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_x(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_x(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t block_id_x(void) {
+  return __builtin_gpu_block_id_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_y(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_y(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.y()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_y(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_y(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t block_id_y(void) {
+  return __builtin_gpu_block_id_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_z(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_z(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.z()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_z(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_z(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t block_id_z(void) {
+  return __builtin_gpu_block_id_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_x(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_x(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.x()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_x(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_x(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_threads_x(void) {
+  return __builtin_gpu_num_threads_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_y(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_y(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.y()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_y(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_y(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_threads_y(void) {
+  return __builtin_gpu_num_threads_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_z(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_z(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.z()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_z(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_z(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_threads_z(void) {
+  return __builtin_gpu_num_threads_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_x(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_x(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.x()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_x(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_x(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t thread_id_x(void) {
+  return __builtin_gpu_thread_id_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_y(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_y(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.y()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_y(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_y(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t thread_id_y(void) {
+  return __builtin_gpu_thread_id_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_z(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_z(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.z()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_z(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_z(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t thread_id_z(void) {
+  return __builtin_gpu_thread_id_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_lanes(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_lanes(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.lanes()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_lanes(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_lanes(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t num_lanes(void) {
+  return __builtin_gpu_num_lanes();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @lane_id(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @lane_id(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.lane.id()
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @lane_id(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @lane_id(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t lane_id(void) {
+  return __builtin_gpu_lane_id();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @lane_mask(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// SPIRV64-NEXT:    ret i64 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @lane_mask(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.lane.mask()
+// AMDHSA-NEXT:    ret i64 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @lane_mask(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// NVPTX-NEXT:    ret i64 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @lane_mask(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// AMDGCN-NEXT:    ret i64 [[TMP0]]
+//
+uint64_t lane_mask(void) {
+  return __builtin_gpu_lane_mask();
+}
+
+
+uint32_t
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @read_first_lane_u32(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @read_first_lane_u32(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @read_first_lane_u32(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @read_first_lane_u32(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_gpu_read_first_lane_u32(__lane_mask, __x);
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @ballot(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// SPIRV64-NEXT:    ret i64 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @ballot(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// AMDHSA-NEXT:    ret i64 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @ballot(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// NVPTX-NEXT:    ret i64 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @ballot(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// AMDGCN-NEXT:    ret i64 [[TMP0]]
+//
+uint64_t ballot(uint64_t __lane_mask,
+                            _Bool __x) {
+  return __builtin_gpu_ballot(__lane_mask, __x);
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @sync_threads(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    tail call void @llvm.gpu.sync.threads()
+// SPIRV64-NEXT:    ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @sync_threads(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    tail call addrspace(4) void @llvm.gpu.sync.threads()
+// AMDHSA-NEXT:    ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @sync_threads(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    tail call void @llvm.gpu.sync.threads()
+// NVPTX-NEXT:    ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @sync_threads(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    tail call void @llvm.gpu.sync.threads()
+// AMDGCN-NEXT:    ret void
+//
+void sync_threads(void) {
+  return __builtin_gpu_sync_threads();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @sync_lane(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// SPIRV64-NEXT:    ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @sync_lane(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    tail call addrspace(4) void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// AMDHSA-NEXT:    ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @sync_lane(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// NVPTX-NEXT:    ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @sync_lane(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// AMDGCN-NEXT:    ret void
+//
+void sync_lane(uint64_t __lane_mask) {
+  return __builtin_gpu_sync_lane(__lane_mask);
+}
+
+
+uint32_t
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @shuffle_idx_u32(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// SPIRV64-NEXT:    ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @shuffle_idx_u32(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// AMDHSA-NEXT:    ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @shuffle_idx_u32(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @shuffle_idx_u32(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width);
+}
+
+// SPIRV64: Function Attrs: nofree norecurse noreturn nosync nounwind
+// SPIRV64-LABEL: @gpu_exit(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    tail call void @llvm.gpu.exit()
+// SPIRV64-NEXT:    unreachable
+//
+// AMDHSA: Function Attrs: nofree norecurse noreturn nosync nounwind
+// AMDHSA-LABEL: @gpu_exit(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    tail call addrspace(4) void @llvm.gpu.exit()
+// AMDHSA-NEXT:    unreachable
+//
+// NVPTX: Function Attrs: convergent nofree norecurse noreturn nosync nounwind
+// NVPTX-LABEL: @gpu_exit(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    tail call void @llvm.gpu.exit()
+// NVPTX-NEXT:    unreachable
+//
+// AMDGCN: Function Attrs: convergent nofree norecurse noreturn nosync nounwind
+// AMDGCN-LABEL: @gpu_exit(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    tail call void @llvm.gpu.exit()
+// AMDGCN-NEXT:    unreachable
+//
+void gpu_exit(void) {
+  return __builtin_gpu_exit();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @thread_suspend(
+// SPIRV64-NEXT:  entry:
+// SPIRV64-NEXT:    tail call void @llvm.gpu.thread.suspend()
+// SPIRV64-NEXT:    ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @thread_suspend(
+// AMDHSA-NEXT:  entry:
+// AMDHSA-NEXT:    tail call addrspace(4) void @llvm.gpu.thread.suspend()
+// AMDHSA-NEXT:    ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @thread_suspend(
+// NVPTX-NEXT:  entry:
+// NVPTX-NEXT:    tail call void @llvm.gpu.thread.suspend()
+// NVPTX-NEXT:    ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @thread_suspend(
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    tail call void @llvm.gpu.thread.suspend()
+// AMDGCN-NEXT:    ret void
+//
+void thread_suspend(void) {
+  return __builtin_gpu_thread_suspend();
+}
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 9a15ce277ba87..eaf001be19ac9 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -9,6 +9,11 @@
 // RUN:   -target-feature +ptx62 \
 // RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=NVPTX
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include  \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple spirv64-- -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SPIRV64
 
 #include <gpuintrin.h>
 
@@ -978,6 +983,224 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    call void @llvm.nvvm.exit()
 // NVPTX-NEXT:    ret void
 //
+//
+// SPIRV64-LABEL: define spir_func void @foo(
+// SPIRV64-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV64-NEXT:    [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV64-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV64-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0)
+// SPIRV64-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV64-NEXT:    [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV64-NEXT:    [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV64-NEXT:    [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0)
+// SPIRV64-NEXT:    [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV64-NEXT:    [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV64-NEXT:    [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV64-NEXT:    [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0)
+// SPIRV64-NEXT:    [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV64-NEXT:    [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV64-NEXT:    [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV64-NEXT:    [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0)
+// SPIRV64-NEXT:    [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV64-NEXT:    [[CALL17:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV64-NEXT:    [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRV64-NEXT:    [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1)
+// SPIRV64-NEXT:    [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1)
+// SPIRV64-NEXT:    [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true)
+// SPIRV64-NEXT:    call spir_func void @__gpu_sync_threads()
+// SPIRV64-NEXT:    call spir_func void @__gpu_sync_lane(i64 noundef -1)
+// SPIRV64-NEXT:    [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
+// SPIRV64-NEXT:    [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
+// SPIRV64-NEXT:    [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRV64-NEXT:    call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]]
+// SPIRV64-NEXT:    unreachable
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_blocks(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT:    ]
+// SPIRV64:       [[SW_BB]]:
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV64-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN:.*]]
+// SPIRV64:       [[SW_BB1]]:
+// SPIRV64-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV64-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_BB3]]:
+// SPIRV64-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV64-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_DEFAULT]]:
+// SPIRV64-NEXT:    unreachable
+// SPIRV64:       [[RETURN]]:
+// SPIRV64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_block_id(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT:    ]
+// SPIRV64:       [[SW_BB]]:
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV64-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN:.*]]
+// SPIRV64:       [[SW_BB1]]:
+// SPIRV64-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV64-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_BB3]]:
+// SPIRV64-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV64-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_DEFAULT]]:
+// SPIRV64-NEXT:    unreachable
+// SPIRV64:       [[RETURN]]:
+// SPIRV64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_threads(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT:    ]
+// SPIRV64:       [[SW_BB]]:
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV64-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN:.*]]
+// SPIRV64:       [[SW_BB1]]:
+// SPIRV64-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV64-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_BB3]]:
+// SPIRV64-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV64-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_DEFAULT]]:
+// SPIRV64-NEXT:    unreachable
+// SPIRV64:       [[RETURN]]:
+// SPIRV64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_thread_id(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT:    ]
+// SPIRV64:       [[SW_BB]]:
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV64-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN:.*]]
+// SPIRV64:       [[SW_BB1]]:
+// SPIRV64-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV64-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_BB3]]:
+// SPIRV64-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV64-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    br label %[[RETURN]]
+// SPIRV64:       [[SW_DEFAULT]]:
+// SPIRV64-NEXT:    unreachable
+// SPIRV64:       [[RETURN]]:
+// SPIRV64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT:    [[__X_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT:    [[__HI:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    [[__LO:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// SPIRV64-NEXT:    [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// SPIRV64-NEXT:    store i32 [[CONV]], ptr [[__HI]], align 4
+// SPIRV64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT:    [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// SPIRV64-NEXT:    [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// SPIRV64-NEXT:    store i32 [[CONV1]], ptr [[__LO]], align 4
+// SPIRV64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]])
+// SPIRV64-NEXT:    [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV64-NEXT:    [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// SPIRV64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// SPIRV64-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]])
+// SPIRV64-NEXT:    [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// SPIRV64-NEXT:    [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// SPIRV64-NEXT:    [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// SPIRV64-NEXT:    ret i64 [[OR]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i64 @__gpu_first_lane_id(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
+// SPIRV64-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+// SPIRV64-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
+// SPIRV64-NEXT:    [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
+// SPIRV64-NEXT:    [[CAST:%.*]] = trunc i64 [[FFS]] to i32
+// SPIRV64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
+// SPIRV64-NEXT:    [[CONV:%.*]] = sext i32 [[SUB]] to i64
+// SPIRV64-NEXT:    ret i64 [[CONV]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT:  [[ENTRY:.*:]]
+// SPIRV64-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV64-NEXT:    [[CONV:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT:    [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]])
+// SPIRV64-NEXT:    [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
+// SPIRV64-NEXT:    ret i1 [[CMP]]
+//
 //.
 // AMDGPU: [[RNG3]] = !{i32 1, i32 0}
 // AMDGPU: [[META4]] = !{}
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 62239ca705b9e..c0613786a13bb 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2861,6 +2861,69 @@ def int_experimental_convergence_anchor
 def int_experimental_convergence_loop
   : DefaultAttrsIntrinsic<[llvm_token_ty], [], [IntrNoMem, IntrConvergent]>;
 
+//===------- GPU Intrinsics -----------------------------------------------===//
+
+class GPUIntrinsic<LLVMType ret_type, string name>
+  : DefaultAttrsIntrinsic<[ret_type],
+                          [],
+                          [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>,
+                          ClangBuiltin<name>;
+
+multiclass GPUGridIntrinsic_xyz<string prefix> {
+  def _x : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_x")>;
+  def _y : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_y")>;
+  def _z : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_z")>;
+}
+
+defm int_gpu_num_blocks : GPUGridIntrinsic_xyz<"__builtin_gpu_num_blocks">;
+defm int_gpu_block_id : GPUGridIntrinsic_xyz<"__builtin_gpu_block_id">;
+defm int_gpu_num_threads : GPUGridIntrinsic_xyz<"__builtin_gpu_num_threads">;
+defm int_gpu_thread_id : GPUGridIntrinsic_xyz<"__builtin_gpu_thread_id">;
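+// For example, int_gpu_thread_id_x is llvm.gpu.thread.id.x in IR, is exposed
+// in C as __builtin_gpu_thread_id_x, and is mapped onto the matching target
+// intrinsic or instruction sequence by the LowerGPUIntrinsic pass.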
+
+def int_gpu_num_lanes : GPUIntrinsic<llvm_i32_ty, "__builtin_gpu_num_lanes">;
+def int_gpu_lane_id : GPUIntrinsic<llvm_i32_ty, "__builtin_gpu_lane_id">;
+def int_gpu_lane_mask : GPUIntrinsic<llvm_i64_ty, "__builtin_gpu_lane_mask">;
+
+def int_gpu_read_first_lane_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+                                                        [llvm_i64_ty, llvm_i32_ty],
+                                                        [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+                                                        ClangBuiltin<"__builtin_gpu_read_first_lane_u32">;
+
+def int_gpu_shuffle_idx_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+                                                    [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                                                    [IntrNoMem, IntrConvergent, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+                                                    ClangBuiltin<"__builtin_gpu_shuffle_idx_u32">;
+
+def int_gpu_ballot : DefaultAttrsIntrinsic<[llvm_i64_ty],
+                                           [llvm_i64_ty, llvm_i1_ty],
+                                           [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+                                           ClangBuiltin<"__builtin_gpu_ballot">;
+
+def int_gpu_sync_threads : DefaultAttrsIntrinsic<[], [],
+    // TODO: the attributes match those of a barrier, but the amdgpu lowering
+    // also emits a fence; revisit whether the fence is needed and whether the
+    // attribute list should reflect it.
+                                                 [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                                  IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+                                                 ClangBuiltin<"__builtin_gpu_sync_threads">;
+
+def int_gpu_sync_lane : DefaultAttrsIntrinsic<[],
+                                              [llvm_i64_ty],
+                                              [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                              IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+                                              ClangBuiltin<"__builtin_gpu_sync_lane">;
+
+def int_gpu_exit : DefaultAttrsIntrinsic<[],
+                                         [],
+                                         [IntrNoReturn, IntrConvergent]>,
+                                         ClangBuiltin<"__builtin_gpu_exit">;
+
+
+def int_gpu_thread_suspend : DefaultAttrsIntrinsic<[],
+                                                   [],
+                                                   [IntrWillReturn, IntrNoMem, IntrHasSideEffects, IntrConvergent]>,
+                                                   ClangBuiltin<"__builtin_gpu_thread_suspend">;
+
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..91095f2880d03 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -150,6 +150,8 @@ defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
 defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                <"__builtin_amdgcn_workgroup_id">;
 
+defm int_amdgcn_grid_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+
 def int_amdgcn_dispatch_ptr :
   DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
   [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 78ff93019fd7e..9cbe7b068c394 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -175,6 +175,7 @@ void initializeLoopUnrollPass(PassRegistry &);
 void initializeLowerAtomicLegacyPassPass(PassRegistry &);
 void initializeLowerEmuTLSPass(PassRegistry &);
 void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
+void initializeLowerGPUIntrinsicPass(PassRegistry &);
 void initializeLowerIntrinsicsPass(PassRegistry &);
 void initializeLowerInvokeLegacyPassPass(PassRegistry &);
 void initializeLowerSwitchLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index fc772a7639c47..d746b7a39b871 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -135,6 +135,12 @@ FunctionPass *createSinkingPass();
 //
 Pass *createLowerAtomicPass();
 
+//===----------------------------------------------------------------------===//
+//
+// LowerGPUIntrinsic - Lower GPU intrinsics
+//
+Pass *createLowerGPUIntrinsicPass();
+
 //===----------------------------------------------------------------------===//
 //
 // MergeICmps - Merge integer comparison chains into a memcmp
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
new file mode 100644
index 0000000000000..6e793d4965287
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
@@ -0,0 +1,26 @@
+//===--- LowerGPUIntrinsic.h - Lower GPU intrinsics -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers GPU intrinsics.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
+#define LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct LowerGPUIntrinsicPass : public PassInfoMixin<LowerGPUIntrinsicPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; } // ensure the pass also runs at O0
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 99e78d3b6feb8..32234df9a364f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -295,6 +295,7 @@
 #include "llvm/Transforms/Scalar/LowerAtomicPass.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 586d4b7e02fc1..f24b0a2e37329 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -96,6 +96,7 @@ MODULE_PASS("iroutliner", IROutlinerPass())
 MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass())
 MODULE_PASS("lower-emutls", LowerEmuTLSPass())
 MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass())
+MODULE_PASS("lower-gpu-intrinsic", LowerGPUIntrinsicPass())
 MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 84a5b02043d01..f35c81f2e661b 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -51,6 +51,7 @@ add_llvm_component_library(LLVMScalarOpts
   LowerAtomicPass.cpp
   LowerConstantIntrinsics.cpp
   LowerExpectIntrinsic.cpp
+  LowerGPUIntrinsic.cpp
   LowerGuardIntrinsic.cpp
   LowerMatrixIntrinsics.cpp
   LowerWidenableCondition.cpp
diff --git a/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
new file mode 100644
index 0000000000000..a65e9fa731e67
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// The pass can be run from clang when building for a specific GPU, or from the
+// backend as part of a SPIRV lowering pipeline. It currently lowers to amdgcn
+// or nvptx; supporting another architecture means adding a column to the
+// lookup table, and supporting another intrinsic means adding a row.
+//
+// The intrinsics are intended as a thin abstraction over the different GPU
+// architectures. In particular, code compiled to spirv-- without choosing a
+// concrete target can be specialised at JIT time, at which point this pass
+// rewrites the intrinsics into ones the selected backend understands.
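+//
+// For example, when lowering for an amdgcn target,
+//   %tid = call i32 @llvm.gpu.thread.id.x()
+// is rewritten in place to
+//   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+// as exercised by llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll.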
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+                              CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different intrinsic
+// that has the same signature.
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+  static_assert(To != Intrinsic::not_intrinsic);
+  Intrinsic::ID GenericID = from;
+  Intrinsic::ID SpecificID = To;
+
+  bool Changed = false;
+  Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+  auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+  if ((Generic->getType() != Specific->getType()) ||
+      (Generic->getReturnType() != Specific->getReturnType()))
+    report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+                       "intrinsics in lookup table");
+
+  CI->setCalledFunction(Specific);
+  Changed = true;
+
+  return Changed;
+}
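+
+// For example, the lookup table at the end of this file uses
+// S<nvvm_read_ptx_sreg_tid_x> to lower llvm.gpu.thread.id.x on nvptx by
+// swapping the called intrinsic in place.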
+
+// Replace intrinsic call with a linear sequence of instructions
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                          CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+  bool Changed = false;
+
+  Builder.SetInsertPoint(CI);
+
+  Value *replacement = F(M, Builder, from, CI);
+  if (replacement) {
+    CI->replaceAllUsesWith(replacement);
+    CI->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+                      CallBase *) {
+  Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+  Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+  return Builder.CreateUDiv(N, D);
+}
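+
+// Used by the table below to express, for example, gpu_num_blocks_x on amdgpu
+// as amdgcn_grid_size_x / gpu_num_threads_x.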
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+      {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+               CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+  Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  CallInst *Lo =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                 {Type::getInt32Ty(Ctx)},
+                                 {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
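+  // For power-of-two widths, id & -width == id & ~(width - 1), i.e. the lane
+  // id rounded down to the start of its width-wide segment; ds_bpermute takes
+  // a byte address, hence the final shift left by 2.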
+  Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                               width, "not");
+  Value *a = Builder.CreateAnd(id, n, "and");
+  Value *add = Builder.CreateAdd(a, idx, "add");
+  Value *shl =
+      Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *C =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+                              {CI->getArgOperand(1)});
+
+  return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                    CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                 Ctx.getOrInsertSyncScopeID("workgroup"));
+
+  return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+                                 {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+  auto &Ctx = M.getContext();
+  const unsigned XOffset = 12;
+  auto *DP = dispatch_ptr(Builder);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+  auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+  auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+  llvm::MDBuilder MDB(Ctx);
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+  return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+
+  // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+  //       clang during compilation of user code.
+  StringRef Name = "__oclc_ABI_version";
+  auto *ABIVersionC = M.getNamedGlobal(Name);
+  if (!ABIVersionC) {
+    // In CGBuiltin, we'd have to create an extern variable to emit the load for
+    // Here, we can leave the intrinsic in place and it'll get lowered later
+    return nullptr;
+  }
+  auto &Ctx = M.getContext();
+
+  Value *ABIVersion = Builder.CreateLoad(Type::getInt32Ty(Ctx), ABIVersionC);
+
+  Value *IsCOV5 = Builder.CreateICmpSGE(
+      ABIVersion,
+      ConstantInt::get(Type::getInt32Ty(Ctx), CodeObjectVersionKind::COV_5));
+
+  Value *ImplicitGEP = Builder.CreateConstGEP1_32(
+      Type::getInt8Ty(Ctx), implicit_arg_ptr(Builder), 12 + Index * 2);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  Value *DispatchGEP = Builder.CreateConstGEP1_32(
+      Type::getInt8Ty(Ctx), dispatch_ptr(Builder), 4 + Index * 2);
+
+  auto Result = Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
+  LoadInst *LD = Builder.CreateLoad(Type::getInt16Ty(Ctx), Result);
+
+  // TODO: CGBuiltin digs MaxOpenCLWorkGroupSize out of TargetInfo and limits
+  // the range on the load based on that (MD_range).
+
+  LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(Ctx, {}));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+
+  // The workgroup size is a uint16_t but gpu_num_threads returns a uint32_t
+  return Builder.CreateZExt(LD, Type::getInt32Ty(Ctx));
+}
+
+} // namespace amdgpu
+
+namespace nvptx {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  CallInst *C = Builder.CreateIntrinsic(Intrinsic::nvvm_activemask, {}, {});
+  return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
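+  // The first active lane is the lowest set bit of the mask: cttz finds its
+  // index (the select covers the all-zero case) and shfl.sync broadcasts x
+  // from that lane.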
+  Value *conv =
+      Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+  Value *C = Builder.CreateIntrinsic(
+      Intrinsic::cttz, {Type::getInt32Ty(Ctx)},
+      {conv, ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+  Value *iszero = Builder.CreateICmpEQ(
+      conv, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "iszero");
+  Value *sub = Builder.CreateSelect(
+      iszero, ConstantInt::get(Type::getInt32Ty(Ctx), -1), C, "sub");
+
+  return Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+                                 {conv, CI->getArgOperand(1), sub,
+                                  ConstantInt::get(Type::getInt32Ty(Ctx), 31)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *lane_mask = CI->getArgOperand(0);
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *Conv = Builder.CreateTrunc(lane_mask, Type::getInt32Ty(Ctx), "conv");
+
+  Value *sh_prom = Builder.CreateZExt(idx, Type::getInt64Ty(Ctx), "sh_prom");
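+  // 8223 - (width << 8) == ((32 - width) << 8) | 0x1f, the packed segment
+  // mask / clamp operand that shfl.sync expects for a width-lane segment.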
+  Value *shl0 =
+      Builder.CreateShl(width, ConstantInt::get(Type::getInt32Ty(Ctx), 8));
+  Value *or0 = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 8223),
+                                 shl0, "or");
+
+  Value *core = Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+                                        {Conv, x, idx, or0});
+
+  Value *shl1 =
+      Builder.CreateShl(ConstantInt::get(Type::getInt64Ty(Ctx), 1), sh_prom);
+  Value *and0 = Builder.CreateAnd(shl1, lane_mask);
+  Value *cmp =
+      Builder.CreateICmpEQ(and0, ConstantInt::get(Type::getInt64Ty(Ctx), 0));
+  Value *and4 = Builder.CreateSelect(
+      cmp, ConstantInt::get(Type::getInt32Ty(Ctx), 0), core, "and4");
+
+  return and4;
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Value *Conv =
+      Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+  Value *C = Builder.CreateIntrinsic(Intrinsic::nvvm_vote_ballot_sync, {},
+                                     {Conv, CI->getArgOperand(1)});
+
+  return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  Value *X = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx));
+  return Builder.CreateIntrinsic(Intrinsic::nvvm_bar_warp_sync, {}, {X});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+
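+  // nvvm.reflect of "__CUDA_ARCH" resolves to 10 * the SM version when the
+  // target is finalised, so nanosleep is only emitted when the reported arch
+  // is greater than 699 (sm_70 or newer, where the instruction is available).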
+  Value *str = Builder.CreateGlobalString(
+      "__CUDA_ARCH", "", DL.getDefaultGlobalsAddressSpace(), &M);
+
+  Builder.SetInsertPoint(CI);
+  Value *Reflect = Builder.CreateIntrinsic(Intrinsic::nvvm_reflect, {}, {str});
+  Value *Cmp = Builder.CreateICmpUGT(
+      Reflect, ConstantInt::get(Type::getInt32Ty(Ctx), 699));
+
+  Builder.SetInsertPoint(SplitBlockAndInsertIfThen(Cmp, CI, false));
+
+  Builder.CreateIntrinsic(Intrinsic::nvvm_nanosleep, {},
+                          {ConstantInt::get(Type::getInt32Ty(Ctx), 64)});
+
+  CI->eraseFromParent();
+  return nullptr; // All done
+}
+
+} // namespace nvptx
+
+struct IntrinsicMap {
+  Intrinsic::ID Generic;
+  lowerFunction AMDGPU;
+  lowerFunction NVPTX;
+};
+
+using namespace Intrinsic;
+
+static const IntrinsicMap ls[] = {
+    // This table maps each intrinsic to its lowering and is walked in order.
+    // A row can create calls to intrinsics that are expanded by subsequent
+    // rows, so the order of rows is significant.
+    // S<intrinsic> is a simple 1:1 lowering to an existing intrinsic.
+    // B<function> builds a short sequence of instructions.
+
+    // amdgpu defines num_blocks as an integer ratio of two other intrinsics.
+    // amdgcn_grid_size and gpu_num_threads are expanded further down.
+    {
+        gpu_num_blocks_x,
+        B<intrinsicRatio<amdgcn_grid_size_x, gpu_num_threads_x>>,
+        S<nvvm_read_ptx_sreg_nctaid_x>,
+    },
+    {
+        gpu_num_blocks_y,
+        B<intrinsicRatio<amdgcn_grid_size_y, gpu_num_threads_y>>,
+        S<nvvm_read_ptx_sreg_nctaid_y>,
+    },
+    {
+        gpu_num_blocks_z,
+        B<intrinsicRatio<amdgcn_grid_size_z, gpu_num_threads_z>>,
+        S<nvvm_read_ptx_sreg_nctaid_z>,
+    },
+
+    // Note: Could canonicalise in favour of the target agnostic one without
+    // breaking existing users of builtin or intrinsic:
+    //  {amdgcn_workgroup_id_x, S<gpu_block_id_x>, nullptr},
+    //  {gpu_block_id_x, nullptr, S<nvvm_read_ptx_sreg_ctaid_x>},
+    // Using the target-agnostic one throughout the rest of the backend would
+    // work fine, and the amdgpu-no-workgroup-id-x attribute and similar may be
+    // applicable to other targets as well.
+    // Map {block,thread}_id onto existing intrinsics for the time being.
+    {gpu_block_id_x, S<amdgcn_workgroup_id_x>, S<nvvm_read_ptx_sreg_ctaid_x>},
+    {gpu_block_id_y, S<amdgcn_workgroup_id_y>, S<nvvm_read_ptx_sreg_ctaid_y>},
+    {gpu_block_id_z, S<amdgcn_workgroup_id_z>, S<nvvm_read_ptx_sreg_ctaid_z>},
+    {gpu_thread_id_x, S<amdgcn_workitem_id_x>, S<nvvm_read_ptx_sreg_tid_x>},
+    {gpu_thread_id_y, S<amdgcn_workitem_id_y>, S<nvvm_read_ptx_sreg_tid_y>},
+    {gpu_thread_id_z, S<amdgcn_workitem_id_z>, S<nvvm_read_ptx_sreg_tid_z>},
+
+    // CGBuiltin maps builtin_amdgcn_workgroup_size onto gpu_num_threads
+    {gpu_num_threads_x, B<amdgpu::WGSize<0>>, S<nvvm_read_ptx_sreg_ntid_x>},
+    {gpu_num_threads_y, B<amdgpu::WGSize<1>>, S<nvvm_read_ptx_sreg_ntid_y>},
+    {gpu_num_threads_z, B<amdgpu::WGSize<2>>, S<nvvm_read_ptx_sreg_ntid_z>},
+
+    // Some of the following intrinsics need minor impedance matching
+    {gpu_num_lanes, S<amdgcn_wavefrontsize>, S<nvvm_read_ptx_sreg_warpsize>},
+    {gpu_lane_mask, B<amdgpu::lane_mask>, B<nvptx::lane_mask>},
+
+    {gpu_read_first_lane_u32, B<amdgpu::first_lane>, B<nvptx::first_lane>},
+    {gpu_shuffle_idx_u32, B<amdgpu::shuffle_idx>, B<nvptx::shuffle_idx>},
+
+    // shuffle sometimes emits call into lane_id so lower lane_id after shuffle
+    {gpu_lane_id, B<amdgpu::lane_id>, S<nvvm_read_ptx_sreg_laneid>},
+
+    {gpu_ballot, B<amdgpu::ballot>, B<nvptx::ballot>},
+
+    {gpu_sync_threads, B<amdgpu::sync_threads>, S<nvvm_barrier0>},
+    {gpu_sync_lane, B<amdgpu::sync_lane>, B<nvptx::sync_lane>},
+
+    {gpu_thread_suspend, B<amdgpu::thread_suspend>, B<nvptx::thread_suspend>},
+    {gpu_exit, S<amdgcn_endpgm>, S<nvvm_exit>},
+
+    // These aren't generic intrinsics but lowering them here instead of
+    // in CGBuiltin allows the above to be implemented partly in terms of
+    // amdgcn_grid_size.
+    {amdgcn_grid_size_x, B<amdgpu::grid_size<0>>, nullptr},
+    {amdgcn_grid_size_y, B<amdgpu::grid_size<1>>, nullptr},
+    {amdgcn_grid_size_z, B<amdgpu::grid_size<2>>, nullptr},
+};
+
+class LowerGPUIntrinsic : public ModulePass {
+public:
+  static char ID;
+
+  LowerGPUIntrinsic() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+};
+
+bool LowerGPUIntrinsic::runOnModule(Module &M) {
+  bool Changed = false;
+
+  Triple TT(M.getTargetTriple());
+
+  if (!TT.isAMDGPU() && !TT.isNVPTX()) {
+    return Changed;
+  }
+
+  auto &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+
+  for (const IntrinsicMap &I : ls) {
+    auto *Intr = Intrinsic::getDeclarationIfExists(&M, I.Generic);
+    if (!Intr)
+      continue;
+
+    lowerFunction maybeLowering = TT.isAMDGPU() ? I.AMDGPU : I.NVPTX;
+    if (maybeLowering == nullptr)
+      continue;
+
+    for (auto *U : make_early_inc_range(Intr->users())) {
+      if (auto *CI = dyn_cast<CallBase>(U)) {
+        if (CI->getCalledFunction() == Intr)
+          Changed |= maybeLowering(M, Builder, I.Generic, CI);
+      }
+    }
+  }
+
+  return Changed;
+}
+
+} // namespace
+
+char LowerGPUIntrinsic::ID = 0;
+
+INITIALIZE_PASS(LowerGPUIntrinsic, DEBUG_TYPE, "Lower GPU Intrinsic", false,
+                false)
+
+Pass *llvm::createLowerGPUIntrinsicPass() { return new LowerGPUIntrinsic(); }
+
+PreservedAnalyses LowerGPUIntrinsicPass::run(Module &M,
+                                             ModuleAnalysisManager &) {
+  return LowerGPUIntrinsic().runOnModule(M) ? PreservedAnalyses::none()
+                                            : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e824700..66ef7ebfa5fe5 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -34,6 +34,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopTermFoldPass(Registry);
   initializeLoopUnrollPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
+  initializeLowerGPUIntrinsicPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
diff --git a/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll
new file mode 100644
index 0000000000000..41c14bc323e61
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll
@@ -0,0 +1,427 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt -S -mtriple=amdgcn-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=AMDGCN
+; RUN: opt -S -mtriple=nvptx64-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=NVPTX
+
+; Used by amdgpu to lower llvm.gpu.num.threads, harmless on nvptx
+ at __oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500
+
+define i32 @num_blocks_x() {
+; AMDGCN-LABEL: @num_blocks_x(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 12
+; AMDGCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0:![0-9]+]], !invariant.load [[META1:![0-9]+]]
+; AMDGCN-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500
+; AMDGCN-NEXT:    [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 12
+; AMDGCN-NEXT:    [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 4
+; AMDGCN-NEXT:    [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]]
+; AMDGCN-NEXT:    ret i32 [[TMP5]]
+;
+; NVPTX-LABEL: @num_blocks_x(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.blocks.x()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.x()
+
+define i32 @num_blocks_y() {
+; AMDGCN-LABEL: @num_blocks_y(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 16
+; AMDGCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]]
+; AMDGCN-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500
+; AMDGCN-NEXT:    [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 14
+; AMDGCN-NEXT:    [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 6
+; AMDGCN-NEXT:    [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]]
+; AMDGCN-NEXT:    ret i32 [[TMP5]]
+;
+; NVPTX-LABEL: @num_blocks_y(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.blocks.y()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.y()
+
+define i32 @num_blocks_z() {
+; AMDGCN-LABEL: @num_blocks_z(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 20
+; AMDGCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]]
+; AMDGCN-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500
+; AMDGCN-NEXT:    [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16
+; AMDGCN-NEXT:    [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 8
+; AMDGCN-NEXT:    [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]]
+; AMDGCN-NEXT:    ret i32 [[TMP5]]
+;
+; NVPTX-LABEL: @num_blocks_z(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.blocks.z()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.z()
+
+define i32 @block_id_x() {
+; AMDGCN-LABEL: @block_id_x(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @block_id_x(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.block.id.x()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.x()
+
+define i32 @block_id_y() {
+; AMDGCN-LABEL: @block_id_y(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @block_id_y(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.block.id.y()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.y()
+
+define i32 @block_id_z() {
+; AMDGCN-LABEL: @block_id_z(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @block_id_z(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.block.id.z()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.z()
+
+define i32 @num_threads_x() {
+; AMDGCN-LABEL: @num_threads_x(
+; AMDGCN-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500
+; AMDGCN-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12
+; AMDGCN-NEXT:    [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 4
+; AMDGCN-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @num_threads_x(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.threads.x()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.x()
+
+define i32 @num_threads_y() {
+; AMDGCN-LABEL: @num_threads_y(
+; AMDGCN-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500
+; AMDGCN-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14
+; AMDGCN-NEXT:    [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 6
+; AMDGCN-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @num_threads_y(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.threads.y()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.y()
+
+define i32 @num_threads_z() {
+; AMDGCN-LABEL: @num_threads_z(
+; AMDGCN-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT:    [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500
+; AMDGCN-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16
+; AMDGCN-NEXT:    [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 8
+; AMDGCN-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @num_threads_z(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.threads.z()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.z()
+
+define i32 @thread_id_x() {
+; AMDGCN-LABEL: @thread_id_x(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @thread_id_x(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.thread.id.x()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.x()
+
+define i32 @thread_id_y() {
+; AMDGCN-LABEL: @thread_id_y(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @thread_id_y(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.thread.id.y()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.y()
+
+define i32 @thread_id_z() {
+; AMDGCN-LABEL: @thread_id_z(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @thread_id_z(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.thread.id.z()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.z()
+
+define i32 @num_lanes() {
+; AMDGCN-LABEL: @num_lanes(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.wavefrontsize()
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @num_lanes(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.num.lanes()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.lanes()
+
+define i32 @lane_id() {
+; AMDGCN-LABEL: @lane_id(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; AMDGCN-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]])
+; AMDGCN-NEXT:    ret i32 [[TMP2]]
+;
+; NVPTX-LABEL: @lane_id(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+; NVPTX-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.gpu.lane.id()
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.lane.id()
+
+define i64 @lane_mask() {
+; AMDGCN-LABEL: @lane_mask(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; AMDGCN-NEXT:    ret i64 [[TMP1]]
+;
+; NVPTX-LABEL: @lane_mask(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.activemask()
+; NVPTX-NEXT:    [[CONV:%.*]] = zext i32 [[TMP1]] to i64
+; NVPTX-NEXT:    ret i64 [[CONV]]
+;
+  %1 = call i64 @llvm.gpu.lane.mask()
+  ret i64 %1
+}
+
+declare i64 @llvm.gpu.lane.mask()
+
+define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x)  {
+; AMDGCN-LABEL: @read_first_lane_u32(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X:%.*]])
+; AMDGCN-NEXT:    ret i32 [[TMP1]]
+;
+; NVPTX-LABEL: @read_first_lane_u32(
+; NVPTX-NEXT:    [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[CONV]], i1 true)
+; NVPTX-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[CONV]], 0
+; NVPTX-NEXT:    [[SUB:%.*]] = select i1 [[ISZERO]], i32 -1, i32 [[TMP1]]
+; NVPTX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[SUB]], i32 31)
+; NVPTX-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x)
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.read.first.lane.u32(i64, i32)
+
+define i64 @ballot(i64 %lane_mask, i1 zeroext %x)  {
+; AMDGCN-LABEL: @ballot(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[X:%.*]])
+; AMDGCN-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], [[LANE_MASK:%.*]]
+; AMDGCN-NEXT:    ret i64 [[TMP2]]
+;
+; NVPTX-LABEL: @ballot(
+; NVPTX-NEXT:    [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[CONV]], i1 [[X:%.*]])
+; NVPTX-NEXT:    [[CONV1:%.*]] = zext i32 [[TMP1]] to i64
+; NVPTX-NEXT:    ret i64 [[CONV1]]
+;
+  %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x)
+  ret i64 %1
+}
+
+declare i64 @llvm.gpu.ballot(i64, i1)
+
+define void @sync_threads() {
+; AMDGCN-LABEL: @sync_threads(
+; AMDGCN-NEXT:    call void @llvm.amdgcn.s.barrier()
+; AMDGCN-NEXT:    fence syncscope("workgroup") seq_cst
+; AMDGCN-NEXT:    ret void
+;
+; NVPTX-LABEL: @sync_threads(
+; NVPTX-NEXT:    call void @llvm.nvvm.barrier0()
+; NVPTX-NEXT:    ret void
+;
+  call void @llvm.gpu.sync.threads()
+  ret void
+}
+
+declare void @llvm.gpu.sync.threads()
+
+define void @sync_lane(i64 %lane_mask) {
+; AMDGCN-LABEL: @sync_lane(
+; AMDGCN-NEXT:    call void @llvm.amdgcn.wave.barrier()
+; AMDGCN-NEXT:    ret void
+;
+; NVPTX-LABEL: @sync_lane(
+; NVPTX-NEXT:    [[TMP1:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT:    call void @llvm.nvvm.bar.warp.sync(i32 [[TMP1]])
+; NVPTX-NEXT:    ret void
+;
+  call void @llvm.gpu.sync.lane(i64 %lane_mask)
+  ret void
+}
+
+declare void @llvm.gpu.sync.lane(i64)
+
+define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width)  {
+; AMDGCN-LABEL: @shuffle_idx_u32(
+; AMDGCN-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; AMDGCN-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]])
+; AMDGCN-NEXT:    [[NOT:%.*]] = sub i32 0, [[WIDTH:%.*]]
+; AMDGCN-NEXT:    [[AND:%.*]] = and i32 [[TMP2]], [[NOT]]
+; AMDGCN-NEXT:    [[ADD:%.*]] = add i32 [[AND]], [[IDX:%.*]]
+; AMDGCN-NEXT:    [[SHL:%.*]] = shl i32 [[ADD]], 2
+; AMDGCN-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[X:%.*]])
+; AMDGCN-NEXT:    ret i32 [[TMP3]]
+;
+; NVPTX-LABEL: @shuffle_idx_u32(
+; NVPTX-NEXT:    [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT:    [[SH_PROM:%.*]] = zext i32 [[IDX:%.*]] to i64
+; NVPTX-NEXT:    [[TMP1:%.*]] = shl i32 [[WIDTH:%.*]], 8
+; NVPTX-NEXT:    [[OR:%.*]] = sub i32 8223, [[TMP1]]
+; NVPTX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[IDX]], i32 [[OR]])
+; NVPTX-NEXT:    [[TMP3:%.*]] = shl i64 1, [[SH_PROM]]
+; NVPTX-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], [[LANE_MASK]]
+; NVPTX-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0
+; NVPTX-NEXT:    [[AND4:%.*]] = select i1 [[TMP5]], i32 0, i32 [[TMP2]]
+; NVPTX-NEXT:    ret i32 [[AND4]]
+;
+  %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width)
+  ret i32 %1
+}
+
+declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32)
+
+define void @gpu_exit() {
+; AMDGCN-LABEL: @gpu_exit(
+; AMDGCN-NEXT:    call void @llvm.amdgcn.endpgm()
+; AMDGCN-NEXT:    ret void
+;
+; NVPTX-LABEL: @gpu_exit(
+; NVPTX-NEXT:    call void @llvm.nvvm.exit()
+; NVPTX-NEXT:    ret void
+;
+  call void @llvm.gpu.exit()
+  ret void
+}
+
+declare void @llvm.gpu.exit()
+
+define void @thread_suspend() {
+; AMDGCN-LABEL: @thread_suspend(
+; AMDGCN-NEXT:    call void @llvm.amdgcn.s.sleep(i32 2)
+; AMDGCN-NEXT:    ret void
+;
+; NVPTX-LABEL: @thread_suspend(
+; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.nvvm.reflect(ptr @[[GLOB0:[0-9]+]])
+; NVPTX-NEXT:    [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 699
+; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NVPTX:       3:
+; NVPTX-NEXT:    call void @llvm.nvvm.nanosleep(i32 64)
+; NVPTX-NEXT:    br label [[TMP4]]
+; NVPTX:       4:
+; NVPTX-NEXT:    ret void
+;
+  call void @llvm.gpu.thread.suspend()
+  ret void
+}
+
+declare void @llvm.gpu.thread.suspend()


