[clang] [llvm] [SPIRV] GPU intrinsics (PR #131190)
Jon Chesterfield via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 07:34:56 PDT 2025
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131190
>From e3d1c0d0f430a96e26c68e22ab53dc2fa4a14e47 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 12 Mar 2025 20:55:17 +0000
Subject: [PATCH] [SPIRV] GPU intrinsics
---
clang/include/clang/Basic/Builtins.td | 29 +
clang/test/CodeGen/amdgpu-grid-builtins.c | 158 +++++
clang/test/CodeGen/gpu_builtins.c | 647 ++++++++++++++++++
llvm/include/llvm/IR/Intrinsics.td | 63 ++
llvm/include/llvm/InitializePasses.h | 1 +
llvm/include/llvm/Transforms/Scalar.h | 6 +
.../Transforms/Scalar/LowerGPUIntrinsic.h | 26 +
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 +
.../Transforms/Scalar/LowerGPUIntrinsic.cpp | 506 ++++++++++++++
llvm/lib/Transforms/Scalar/Scalar.cpp | 1 +
.../gpu_intrinsic_to_amdgcn.ll | 315 +++++++++
.../gpu_intrinsic_to_nvptx.ll | 266 +++++++
14 files changed, 2021 insertions(+)
create mode 100644 clang/test/CodeGen/amdgpu-grid-builtins.c
create mode 100644 clang/test/CodeGen/gpu_builtins.c
create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
create mode 100644 llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
create mode 100644 llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_amdgcn.ll
create mode 100644 llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_nvptx.ll
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 2fbdfaea57ccd..042508c1e59a8 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4770,6 +4770,35 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
let Prototype = "char const*(...)";
}
+// GPU intrinsics
+class GPUBuiltin<string prototype> : Builtin {
+ let Spellings = ["__builtin_" # NAME];
+ let Prototype = prototype;
+ let Attributes = [NoThrow];
+}
+
+multiclass GPUGridBuiltin<string prototype> {
+ def _x : GPUBuiltin<prototype>;
+ def _y : GPUBuiltin<prototype>;
+ def _z : GPUBuiltin<prototype>;
+}
+
+defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">;
+defm gpu_block_id : GPUGridBuiltin<"uint32_t()">;
+defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">;
+defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">;
+
+def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">;
+def gpu_exit : GPUBuiltin<"void()">;
+def gpu_lane_id : GPUBuiltin<"uint32_t()">;
+def gpu_lane_mask : GPUBuiltin<"uint64_t()">;
+def gpu_num_lanes : GPUBuiltin<"uint32_t()">;
+def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">;
+def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">;
+def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">;
+def gpu_sync_threads : GPUBuiltin<"void()">;
+def gpu_thread_suspend : GPUBuiltin<"void()">;
+
// HLSL
def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_adduint64"];
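For context: the intent is that portable device code uses these spellings directly, with the lowering pass below (or CGBuiltin for a concrete target) choosing the implementation. A minimal usage sketch in C, illustrative only and not part of the patch, assuming just the builtins declared above:

    #include <stdint.h>

    // Flat global thread index along x, written only in terms of the
    // target-agnostic builtins added to Builtins.td above.
    static inline uint32_t global_thread_id_x(void) {
      uint32_t block = __builtin_gpu_block_id_x();        // which block we are in
      uint32_t per_block = __builtin_gpu_num_threads_x(); // threads per block
      uint32_t thread = __builtin_gpu_thread_id_x();      // index within the block
      return block * per_block + thread;
    }
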
diff --git a/clang/test/CodeGen/amdgpu-grid-builtins.c b/clang/test/CodeGen/amdgpu-grid-builtins.c
new file mode 100644
index 0000000000000..2104da2dc3cbc
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-grid-builtins.c
@@ -0,0 +1,158 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s
+
+#include <stdint.h>
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_x(void)
+{
+ return __builtin_amdgcn_workgroup_id_x();
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_y(void)
+{
+ return __builtin_amdgcn_workgroup_id_y();
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workgroup_id_z(void)
+{
+ return __builtin_amdgcn_workgroup_id_z();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workitem_id_x(void)
+{
+ return __builtin_amdgcn_workitem_id_x();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workitem_id_y(void)
+{
+ return __builtin_amdgcn_workitem_id_y();
+}
+
+// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t workitem_id_z(void)
+{
+ return __builtin_amdgcn_workitem_id_z();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]]
+// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT: ret i32 [[CONV]]
+//
+uint32_t workgroup_size_x(void)
+{
+ return __builtin_amdgcn_workgroup_size_x();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT: ret i32 [[CONV]]
+//
+uint32_t workgroup_size_y(void)
+{
+ return __builtin_amdgcn_workgroup_size_y();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
+// CHECK-NEXT: ret i32 [[CONV]]
+//
+uint32_t workgroup_size_z(void)
+{
+ return __builtin_amdgcn_workgroup_size_z();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]]
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t grid_size_x(void)
+{
+ return __builtin_amdgcn_grid_size_x();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t grid_size_y(void)
+{
+ return __builtin_amdgcn_grid_size_y();
+}
+
+// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t grid_size_z(void)
+{
+ return __builtin_amdgcn_grid_size_z();
+}
+
+//.
+// CHECK: [[RNG3]] = !{i16 1, i16 1025}
+// CHECK: [[META4]] = !{}
+// CHECK: [[RNG5]] = !{i32 1, i32 0}
+//.
diff --git a/clang/test/CodeGen/gpu_builtins.c b/clang/test/CodeGen/gpu_builtins.c
new file mode 100644
index 0000000000000..8231b0952fa5e
--- /dev/null
+++ b/clang/test/CodeGen/gpu_builtins.c
@@ -0,0 +1,647 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
+// RUN: %clang_cc1 -O1 -triple spirv64 %s -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV64
+// RUN: %clang_cc1 -O1 -triple spirv64-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix=AMDHSA
+// RUN: %clang_cc1 -O1 -triple nvptx64 -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX
+// RUN: %clang_cc1 -O1 -triple amdgcn -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN
+
+#include <stdint.h>
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_x(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_x(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.x()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_x(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_x(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_blocks_x(void) {
+ return __builtin_gpu_num_blocks_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_y(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_y(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.y()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_y(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_y(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_blocks_y(void) {
+ return __builtin_gpu_num_blocks_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_blocks_z(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_blocks_z(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.z()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_blocks_z(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_blocks_z(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_blocks_z(void) {
+ return __builtin_gpu_num_blocks_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_x(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_x(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.x()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_x(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_x(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t block_id_x(void) {
+ return __builtin_gpu_block_id_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_y(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_y(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.y()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_y(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_y(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t block_id_y(void) {
+ return __builtin_gpu_block_id_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @block_id_z(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @block_id_z(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.z()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @block_id_z(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @block_id_z(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t block_id_z(void) {
+ return __builtin_gpu_block_id_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_x(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_x(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.x()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_x(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_x(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_threads_x(void) {
+ return __builtin_gpu_num_threads_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_y(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_y(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.y()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_y(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_y(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_threads_y(void) {
+ return __builtin_gpu_num_threads_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_threads_z(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_threads_z(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.z()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_threads_z(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_threads_z(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_threads_z(void) {
+ return __builtin_gpu_num_threads_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_x(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_x(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.x()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_x(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_x(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t thread_id_x(void) {
+ return __builtin_gpu_thread_id_x();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_y(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_y(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.y()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_y(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_y(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t thread_id_y(void) {
+ return __builtin_gpu_thread_id_y();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @thread_id_z(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @thread_id_z(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.z()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @thread_id_z(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @thread_id_z(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t thread_id_z(void) {
+ return __builtin_gpu_thread_id_z();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @num_lanes(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @num_lanes(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.lanes()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @num_lanes(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @num_lanes(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t num_lanes(void) {
+ return __builtin_gpu_num_lanes();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @lane_id(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @lane_id(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.lane.id()
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @lane_id(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @lane_id(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id()
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+uint32_t lane_id(void) {
+ return __builtin_gpu_lane_id();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @lane_mask(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// SPIRV64-NEXT: ret i64 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @lane_mask(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.lane.mask()
+// AMDHSA-NEXT: ret i64 [[TMP0]]
+//
+// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @lane_mask(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// NVPTX-NEXT: ret i64 [[TMP0]]
+//
+// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @lane_mask(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask()
+// AMDGCN-NEXT: ret i64 [[TMP0]]
+//
+uint64_t lane_mask(void) {
+ return __builtin_gpu_lane_mask();
+}
+
+
+uint32_t
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @read_first_lane_u32(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @read_first_lane_u32(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @read_first_lane_u32(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @read_first_lane_u32(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]])
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __builtin_gpu_read_first_lane_u32(__lane_mask, __x);
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @ballot(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// SPIRV64-NEXT: ret i64 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @ballot(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// AMDHSA-NEXT: ret i64 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @ballot(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// NVPTX-NEXT: ret i64 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @ballot(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]])
+// AMDGCN-NEXT: ret i64 [[TMP0]]
+//
+uint64_t ballot(uint64_t __lane_mask,
+ _Bool __x) {
+ return __builtin_gpu_ballot(__lane_mask, __x);
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @sync_threads(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: tail call void @llvm.gpu.sync.threads()
+// SPIRV64-NEXT: ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @sync_threads(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.threads()
+// AMDHSA-NEXT: ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @sync_threads(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: tail call void @llvm.gpu.sync.threads()
+// NVPTX-NEXT: ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @sync_threads(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: tail call void @llvm.gpu.sync.threads()
+// AMDGCN-NEXT: ret void
+//
+void sync_threads(void) {
+ return __builtin_gpu_sync_threads();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @sync_lane(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// SPIRV64-NEXT: ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @sync_lane(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// AMDHSA-NEXT: ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @sync_lane(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// NVPTX-NEXT: ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @sync_lane(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]])
+// AMDGCN-NEXT: ret void
+//
+void sync_lane(uint64_t __lane_mask) {
+ return __builtin_gpu_sync_lane(__lane_mask);
+}
+
+
+uint32_t
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// SPIRV64-LABEL: @shuffle_idx_u32(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// SPIRV64-NEXT: ret i32 [[TMP0]]
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDHSA-LABEL: @shuffle_idx_u32(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// AMDHSA-NEXT: ret i32 [[TMP0]]
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// NVPTX-LABEL: @shuffle_idx_u32(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+// AMDGCN-LABEL: @shuffle_idx_u32(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]])
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width);
+}
+
+// SPIRV64: Function Attrs: nofree norecurse noreturn nosync nounwind
+// SPIRV64-LABEL: @gpu_exit(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: tail call void @llvm.gpu.exit()
+// SPIRV64-NEXT: unreachable
+//
+// AMDHSA: Function Attrs: nofree norecurse noreturn nosync nounwind
+// AMDHSA-LABEL: @gpu_exit(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.exit()
+// AMDHSA-NEXT: unreachable
+//
+// NVPTX: Function Attrs: convergent nofree norecurse noreturn nosync nounwind
+// NVPTX-LABEL: @gpu_exit(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: tail call void @llvm.gpu.exit()
+// NVPTX-NEXT: unreachable
+//
+// AMDGCN: Function Attrs: convergent nofree norecurse noreturn nosync nounwind
+// AMDGCN-LABEL: @gpu_exit(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: tail call void @llvm.gpu.exit()
+// AMDGCN-NEXT: unreachable
+//
+void gpu_exit(void) {
+ return __builtin_gpu_exit();
+}
+
+
+// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// SPIRV64-LABEL: @thread_suspend(
+// SPIRV64-NEXT: entry:
+// SPIRV64-NEXT: tail call void @llvm.gpu.thread.suspend()
+// SPIRV64-NEXT: ret void
+//
+// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
+// AMDHSA-LABEL: @thread_suspend(
+// AMDHSA-NEXT: entry:
+// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.thread.suspend()
+// AMDHSA-NEXT: ret void
+//
+// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// NVPTX-LABEL: @thread_suspend(
+// NVPTX-NEXT: entry:
+// NVPTX-NEXT: tail call void @llvm.gpu.thread.suspend()
+// NVPTX-NEXT: ret void
+//
+// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
+// AMDGCN-LABEL: @thread_suspend(
+// AMDGCN-NEXT: entry:
+// AMDGCN-NEXT: tail call void @llvm.gpu.thread.suspend()
+// AMDGCN-NEXT: ret void
+//
+void thread_suspend(void) {
+ return __builtin_gpu_thread_suspend();
+}
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 62239ca705b9e..c0613786a13bb 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2861,6 +2861,69 @@ def int_experimental_convergence_anchor
def int_experimental_convergence_loop
: DefaultAttrsIntrinsic<[llvm_token_ty], [], [IntrNoMem, IntrConvergent]>;
+//===------- GPU Intrinsics -----------------------------------------------===//
+
+class GPUIntrinsic<LLVMType ret_type, string name>
+ : DefaultAttrsIntrinsic<[ret_type],
+ [],
+ [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>,
+ ClangBuiltin<name>;
+
+multiclass GPUGridIntrinsic_xyz<string prefix> {
+ def _x : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_x")>;
+ def _y : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_y")>;
+ def _z : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_z")>;
+}
+
+defm int_gpu_num_blocks : GPUGridIntrinsic_xyz<"__builtin_gpu_num_blocks">;
+defm int_gpu_block_id : GPUGridIntrinsic_xyz<"__builtin_gpu_block_id">;
+defm int_gpu_num_threads : GPUGridIntrinsic_xyz<"__builtin_gpu_num_threads">;
+defm int_gpu_thread_id : GPUGridIntrinsic_xyz<"__builtin_gpu_thread_id">;
+
+def int_gpu_num_lanes : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_num_lanes">;
+def int_gpu_lane_id : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_lane_id">;
+def int_gpu_lane_mask : GPUIntrinsic<llvm_i64_ty,"__builtin_gpu_lane_mask">;
+
+def int_gpu_read_first_lane_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+ ClangBuiltin<"__builtin_gpu_read_first_lane_u32">;
+
+def int_gpu_shuffle_idx_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+ ClangBuiltin<"__builtin_gpu_shuffle_idx_u32">;
+
+def int_gpu_ballot : DefaultAttrsIntrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i1_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+ ClangBuiltin<"__builtin_gpu_ballot">;
+
+def int_gpu_sync_threads : DefaultAttrsIntrinsic<[], [],
+  // TODO: the attributes match a barrier, but the lowering also emits a
+  // fence; revisit whether the fence is actually needed here.
+ [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+ ClangBuiltin<"__builtin_gpu_sync_threads">;
+
+def int_gpu_sync_lane : DefaultAttrsIntrinsic<[],
+ [llvm_i64_ty],
+ [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>,
+ ClangBuiltin<"__builtin_gpu_sync_lane">;
+
+def int_gpu_exit : DefaultAttrsIntrinsic<[],
+ [],
+ [IntrNoReturn, IntrConvergent]>,
+ ClangBuiltin<"__builtin_gpu_exit">;
+
+
+def int_gpu_thread_suspend : DefaultAttrsIntrinsic<[],
+ [],
+ [IntrWillReturn, IntrNoMem, IntrHasSideEffects, IntrConvergent]>,
+ ClangBuiltin<"__builtin_gpu_thread_suspend">;
+
+
//===----------------------------------------------------------------------===//
// Target-specific intrinsics
//===----------------------------------------------------------------------===//
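The wave-level intrinsics above mirror the clang builtins one-to-one. As a rough sketch of the intended use (illustrative only, not part of the patch; active_flag_count and broadcast_first are hypothetical helpers, and only the __builtin_gpu_* spellings come from this patch):

    #include <stdbool.h>
    #include <stdint.h>

    // Count how many lanes named in the mask have `flag` set.
    static inline uint32_t active_flag_count(uint64_t lane_mask, bool flag) {
      uint64_t votes = __builtin_gpu_ballot(lane_mask, flag);
      return (uint32_t)__builtin_popcountll(votes);
    }

    // Read a value from the first active lane in the mask, making it
    // uniform across the participating lanes.
    static inline uint32_t broadcast_first(uint64_t lane_mask, uint32_t x) {
      return __builtin_gpu_read_first_lane_u32(lane_mask, x);
    }
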
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 78ff93019fd7e..9cbe7b068c394 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -175,6 +175,7 @@ void initializeLoopUnrollPass(PassRegistry &);
void initializeLowerAtomicLegacyPassPass(PassRegistry &);
void initializeLowerEmuTLSPass(PassRegistry &);
void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
+void initializeLowerGPUIntrinsicPass(PassRegistry &);
void initializeLowerIntrinsicsPass(PassRegistry &);
void initializeLowerInvokeLegacyPassPass(PassRegistry &);
void initializeLowerSwitchLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index fc772a7639c47..d746b7a39b871 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -135,6 +135,12 @@ FunctionPass *createSinkingPass();
//
Pass *createLowerAtomicPass();
+//===----------------------------------------------------------------------===//
+//
+// LowerGPUIntrinsic - Lower GPU intrinsics
+//
+Pass *createLowerGPUIntrinsicPass();
+
//===----------------------------------------------------------------------===//
//
// MergeICmps - Merge integer comparison chains into a memcmp
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
new file mode 100644
index 0000000000000..6e793d4965287
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h
@@ -0,0 +1,26 @@
+//===--- LowerGPUIntrinsic.h - Lower GPU intrinsics -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers GPU intrinsics.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
+#define LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct LowerGPUIntrinsicPass : public PassInfoMixin<LowerGPUIntrinsicPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; } // ensure the pass also runs at O0
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 99e78d3b6feb8..32234df9a364f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -295,6 +295,7 @@
#include "llvm/Transforms/Scalar/LowerAtomicPass.h"
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
#include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 586d4b7e02fc1..f24b0a2e37329 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -96,6 +96,7 @@ MODULE_PASS("iroutliner", IROutlinerPass())
MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass())
MODULE_PASS("lower-emutls", LowerEmuTLSPass())
MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass())
+MODULE_PASS("lower-gpu-intrinsic", LowerGPUIntrinsicPass())
MODULE_PASS("lower-ifunc", LowerIFuncPass())
MODULE_PASS("lowertypetests", LowerTypeTestsPass())
MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 84a5b02043d01..f35c81f2e661b 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -51,6 +51,7 @@ add_llvm_component_library(LLVMScalarOpts
LowerAtomicPass.cpp
LowerConstantIntrinsics.cpp
LowerExpectIntrinsic.cpp
+ LowerGPUIntrinsic.cpp
LowerGuardIntrinsic.cpp
LowerMatrixIntrinsics.cpp
LowerWidenableCondition.cpp
diff --git a/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
new file mode 100644
index 0000000000000..3c670340fac0c
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp
@@ -0,0 +1,506 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// The pass can be run from clang when building for a specific GPU, or from the
+// backend as part of a SPIRV lowering pipeline. The initial pass lowers to
+// amdgcn or nvptx; supporting a further architecture means adding a column to
+// the lookup table, and supporting a further intrinsic means adding a row.
+//
+// The idea is for the intrinsics to provide a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a concrete target can be specialised at JIT time, at which point
+// this pass rewrites those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+ CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different one of the
+// same type, optionally refining range metadata on the return value.
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+ static_assert(To != Intrinsic::not_intrinsic);
+ Intrinsic::ID GenericID = from;
+ Intrinsic::ID SpecificID = To;
+
+ bool Changed = false;
+ Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+ auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+ if ((Generic->getType() != Specific->getType()) ||
+ (Generic->getReturnType() != Specific->getReturnType()))
+ report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+ "intrinsics in lookup table");
+
+ CI->setCalledFunction(Specific);
+ Changed = true;
+
+ return Changed;
+}
+
+// Replace intrinsic call with a linear sequence of instructions
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+ bool Changed = false;
+
+ Builder.SetInsertPoint(CI);
+
+ Value *replacement = F(M, Builder, from, CI);
+ if (replacement) {
+ CI->replaceAllUsesWith(replacement);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+ CallBase *) {
+ Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+ Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+ return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+ {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+ Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+ CallInst *Lo =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {Type::getInt32Ty(Ctx)},
+ {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+
+ Value *idx = CI->getArgOperand(1);
+ Value *x = CI->getArgOperand(2);
+ Value *width = CI->getArgOperand(3);
+
+ Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+ Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+ width, "not");
+ Value *a = Builder.CreateAnd(id, n, "and");
+ Value *add = Builder.CreateAdd(a, idx, "add");
+ Value *shl =
+ Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+
+ Value *C =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+ {CI->getArgOperand(1)});
+
+ return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+ Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ Ctx.getOrInsertSyncScopeID("workgroup"));
+
+ return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+ {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+ CallInst *Call =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+ Call->addRetAttr(
+ Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+ Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+ return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+ CallInst *Call =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+ Call->addRetAttr(
+ Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+ Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+ return Call;
+}
+
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+ auto &Ctx = M.getContext();
+ const unsigned XOffset = 12;
+ auto *DP = dispatch_ptr(Builder);
+
+ // Indexing the HSA kernel_dispatch_packet struct.
+ auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+ auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+ auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+ llvm::MDBuilder MDB(Ctx);
+ // Known non-zero.
+ LD->setMetadata(llvm::LLVMContext::MD_range,
+ MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+ LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+ llvm::MDNode::get(Ctx, {}));
+ return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+ CallBase *) {
+
+ // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+ // clang during compilation of user code.
+ StringRef Name = "__oclc_ABI_version";
+ auto *ABIVersionC = M.getNamedGlobal(Name);
+ if (!ABIVersionC) {
+    // CGBuiltin would have to create an extern variable here in order to emit
+    // the load. Here we can leave the intrinsic in place and it will get
+    // lowered later.
+ return nullptr;
+ }
+ auto &Ctx = M.getContext();
+
+ Value *ABIVersion = Builder.CreateLoad(Type::getInt32Ty(Ctx), ABIVersionC);
+
+ Value *IsCOV5 = Builder.CreateICmpSGE(
+ ABIVersion,
+ ConstantInt::get(Type::getInt32Ty(Ctx), CodeObjectVersionKind::COV_5));
+
+ Value *ImplicitGEP = Builder.CreateConstGEP1_32(
+ Type::getInt8Ty(Ctx), implicit_arg_ptr(Builder), 12 + Index * 2);
+
+ // Indexing the HSA kernel_dispatch_packet struct.
+ Value *DispatchGEP = Builder.CreateConstGEP1_32(
+ Type::getInt8Ty(Ctx), dispatch_ptr(Builder), 4 + Index * 2);
+
+ auto Result = Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
+ LoadInst *LD = Builder.CreateLoad(Type::getInt16Ty(Ctx), Result);
+
+ // TODO: CGBuiltin digs MaxOpenCLWorkGroupSize out of targetinfo and limits
+ // the range on the load based on that (MD_range)
+
+ LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(Ctx, {}));
+ LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+ llvm::MDNode::get(Ctx, {}));
+
+  // The workgroup size is a uint16_t but gpu_num_threads returns a uint32_t.
+ return Builder.CreateZExt(LD, Type::getInt32Ty(Ctx));
+}
+
+
+template <int Index>
+Value *NumBlocks(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  // This is __builtin_amdgcn_grid_size / gpu_num_threads. There is no generic
+  // grid size intrinsic to expand to, so open code it directly as
+  // amdgpu::grid_size<Index> / amdgpu::WGSize<Index>.
+  Value *Numerator =
+      grid_size<Index>(M, Builder, Intrinsic::not_intrinsic, nullptr);
+  Value *Denominator =
+      WGSize<Index>(M, Builder, Intrinsic::not_intrinsic, nullptr);
+  return Builder.CreateUDiv(Numerator, Denominator);
+}
+
+} // namespace amdgpu
+
+namespace nvptx {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ CallInst *C = Builder.CreateIntrinsic(Intrinsic::nvvm_activemask, {}, {});
+ return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Value *conv =
+ Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+ Value *C = Builder.CreateIntrinsic(
+ Intrinsic::cttz, {Type::getInt32Ty(Ctx)},
+ {conv, ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+ Value *iszero = Builder.CreateICmpEQ(
+ conv, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "iszero");
+ Value *sub = Builder.CreateSelect(
+ iszero, ConstantInt::get(Type::getInt32Ty(Ctx), -1), C, "sub");
+
+ return Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+ {conv, CI->getArgOperand(1), sub,
+ ConstantInt::get(Type::getInt32Ty(Ctx), 31)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+
+ Value *lane_mask = CI->getArgOperand(0);
+ Value *idx = CI->getArgOperand(1);
+ Value *x = CI->getArgOperand(2);
+ Value *width = CI->getArgOperand(3);
+
+ Value *Conv = Builder.CreateTrunc(lane_mask, Type::getInt32Ty(Ctx), "conv");
+
+ Value *sh_prom = Builder.CreateZExt(idx, Type::getInt64Ty(Ctx), "sh_prom");
+ Value *shl0 =
+ Builder.CreateShl(width, ConstantInt::get(Type::getInt32Ty(Ctx), 8));
+ Value *or0 = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 8223),
+ shl0, "or");
+
+ Value *core = Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+ {Conv, x, idx, or0});
+
+ Value *shl1 =
+ Builder.CreateShl(ConstantInt::get(Type::getInt64Ty(Ctx), 1), sh_prom);
+ Value *and0 = Builder.CreateAnd(shl1, lane_mask);
+ Value *cmp =
+ Builder.CreateICmpEQ(and0, ConstantInt::get(Type::getInt64Ty(Ctx), 0));
+ Value *and4 = Builder.CreateSelect(
+ cmp, ConstantInt::get(Type::getInt32Ty(Ctx), 0), core, "and4");
+
+ return and4;
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Value *Conv =
+ Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+ Value *C = Builder.CreateIntrinsic(Intrinsic::nvvm_vote_ballot_sync, {},
+ {Conv, CI->getArgOperand(1)});
+
+ return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+
+ auto &Ctx = M.getContext();
+ Value *X = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx));
+ return Builder.CreateIntrinsic(Intrinsic::nvvm_bar_warp_sync, {}, {X});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+
+ auto &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+
+ Value *str = Builder.CreateGlobalString(
+ "__CUDA_ARCH", "", DL.getDefaultGlobalsAddressSpace(), &M);
+
+ Builder.SetInsertPoint(CI);
+ Value *Reflect = Builder.CreateIntrinsic(Intrinsic::nvvm_reflect, {}, {str});
+ Value *Cmp = Builder.CreateICmpUGT(
+ Reflect, ConstantInt::get(Type::getInt32Ty(Ctx), 699));
+
+ Builder.SetInsertPoint(SplitBlockAndInsertIfThen(Cmp, CI, false));
+
+ Builder.CreateIntrinsic(Intrinsic::nvvm_nanosleep, {},
+ {ConstantInt::get(Type::getInt32Ty(Ctx), 64)});
+
+ CI->eraseFromParent();
+ return nullptr; // All done
+}
+
+} // namespace nvptx
+
+struct IntrinsicMap {
+ Intrinsic::ID Generic;
+ lowerFunction AMDGPU;
+ lowerFunction NVPTX;
+};
+
+using namespace Intrinsic;
+
+static const IntrinsicMap ls[] = {
+ // This table mapping each intrinsic to its lowering is walked in order.
+ // A row may create calls to intrinsics that are expanded by subsequent
+ // rows, which means the order of the rows is significant.
+ // S<intrinsic> is a simple lowering to an existing intrinsic.
+ // B<function> builds a short sequence of instructions instead.
+ // An illustrative sketch of these helpers follows the table.
+
+ {
+ gpu_num_blocks_x,
+ B<amdgpu::NumBlocks<0>>,
+ S<nvvm_read_ptx_sreg_nctaid_x>,
+ },
+ {
+ gpu_num_blocks_y,
+ B<amdgpu::NumBlocks<1>>,
+ S<nvvm_read_ptx_sreg_nctaid_y>,
+ },
+ {
+ gpu_num_blocks_z,
+ B<amdgpu::NumBlocks<2>>,
+ S<nvvm_read_ptx_sreg_nctaid_z>,
+ },
+
+ // Note: Could canonicalise in favour of the target-agnostic intrinsic
+ // without breaking existing users of the builtin or the intrinsic:
+ // {amdgcn_workgroup_id_x, S<gpu_block_id_x>, nullptr},
+ // {gpu_block_id_x, nullptr, S<nvvm_read_ptx_sreg_ctaid_x>},
+ // Using the target-agnostic intrinsic throughout the rest of the backend
+ // would work fine, and the amdgpu-no-workgroup-id-x attribute and similar
+ // may be applicable to other targets.
+ // For the time being, map {block,thread}_id onto the existing intrinsics.
+ {gpu_block_id_x, S<amdgcn_workgroup_id_x>, S<nvvm_read_ptx_sreg_ctaid_x>},
+ {gpu_block_id_y, S<amdgcn_workgroup_id_y>, S<nvvm_read_ptx_sreg_ctaid_y>},
+ {gpu_block_id_z, S<amdgcn_workgroup_id_z>, S<nvvm_read_ptx_sreg_ctaid_z>},
+ {gpu_thread_id_x, S<amdgcn_workitem_id_x>, S<nvvm_read_ptx_sreg_tid_x>},
+ {gpu_thread_id_y, S<amdgcn_workitem_id_y>, S<nvvm_read_ptx_sreg_tid_y>},
+ {gpu_thread_id_z, S<amdgcn_workitem_id_z>, S<nvvm_read_ptx_sreg_tid_z>},
+
+ // CGBuiltin maps __builtin_amdgcn_workgroup_size onto gpu_num_threads
+ {gpu_num_threads_x, B<amdgpu::WGSize<0>>, S<nvvm_read_ptx_sreg_ntid_x>},
+ {gpu_num_threads_y, B<amdgpu::WGSize<1>>, S<nvvm_read_ptx_sreg_ntid_y>},
+ {gpu_num_threads_z, B<amdgpu::WGSize<2>>, S<nvvm_read_ptx_sreg_ntid_z>},
+
+ // Some of the following intrinsics need minor impedance matching
+ {gpu_num_lanes, S<amdgcn_wavefrontsize>, S<nvvm_read_ptx_sreg_warpsize>},
+ {gpu_lane_mask, B<amdgpu::lane_mask>, B<nvptx::lane_mask>},
+
+ {gpu_read_first_lane_u32, B<amdgpu::first_lane>, B<nvptx::first_lane>},
+ {gpu_shuffle_idx_u32, B<amdgpu::shuffle_idx>, B<nvptx::shuffle_idx>},
+
+ // shuffle sometimes emits a call to lane_id, so lane_id is lowered after shuffle
+ {gpu_lane_id, B<amdgpu::lane_id>, S<nvvm_read_ptx_sreg_laneid>},
+
+ {gpu_ballot, B<amdgpu::ballot>, B<nvptx::ballot>},
+
+ {gpu_sync_threads, B<amdgpu::sync_threads>, S<nvvm_barrier0>},
+ {gpu_sync_lane, B<amdgpu::sync_lane>, B<nvptx::sync_lane>},
+
+ {gpu_thread_suspend, B<amdgpu::thread_suspend>, B<nvptx::thread_suspend>},
+ {gpu_exit, S<amdgcn_endpgm>, S<nvvm_exit>},
+};
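+
+// For reference, an illustrative sketch only (the real S<> and B<> helpers
+// are defined earlier in this file): S<To> is roughly a lowerFunction that
+// retargets the call at an existing intrinsic with a matching signature,
+//
+//   template <Intrinsic::ID To>
+//   bool S(Module &M, IRBuilder<> &, Intrinsic::ID, CallBase *CI) {
+//     CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(&M, To));
+//     return true;
+//   }
+//
+// while B<F> calls the builder function F at the call site, replaces the
+// call's uses with the returned value when there is one, and erases the call.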
+
+class LowerGPUIntrinsic : public ModulePass {
+public:
+ static char ID;
+
+ LowerGPUIntrinsic() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+};
+
+bool LowerGPUIntrinsic::runOnModule(Module &M) {
+ bool Changed = false;
+
+ Triple TT(M.getTargetTriple());
+
+ if (!TT.isAMDGPU() && !TT.isNVPTX()) {
+ return Changed;
+ }
+
+ auto &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+
+ for (const IntrinsicMap &I : ls) {
+ auto *Intr = Intrinsic::getDeclarationIfExists(&M, I.Generic);
+ if (!Intr)
+ continue;
+
+ lowerFunction maybeLowering = TT.isAMDGPU() ? I.AMDGPU : I.NVPTX;
+ if (maybeLowering == nullptr)
+ continue;
+
+ for (auto *U : make_early_inc_range(Intr->users())) {
+ if (auto *CI = dyn_cast<CallBase>(U)) {
+ assert(CI->getCalledFunction() == Intr);
+ Changed |= maybeLowering(M, Builder, I.Generic, CI);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+} // namespace
+
+char LowerGPUIntrinsic::ID = 0;
+
+INITIALIZE_PASS(LowerGPUIntrinsic, DEBUG_TYPE, "Lower GPU Intrinsic", false,
+ false)
+
+Pass *llvm::createLowerGPUIntrinsicPass() { return new LowerGPUIntrinsic(); }
+
+PreservedAnalyses LowerGPUIntrinsicPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ return LowerGPUIntrinsic().runOnModule(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
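+
+// Usage sketch (illustrative): with the new pass manager the lowering can be
+// scheduled programmatically, assuming the usual PassBuilder boilerplate,
+// roughly as
+//
+//   ModuleAnalysisManager MAM;
+//   PassBuilder PB;
+//   PB.registerModuleAnalyses(MAM);
+//   ModulePassManager MPM;
+//   MPM.addPass(LowerGPUIntrinsicPass());
+//   MPM.run(M, MAM);
+//
+// or from the command line via opt -passes=lower-gpu-intrinsic, as the tests
+// below do.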
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e824700..66ef7ebfa5fe5 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -34,6 +34,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopTermFoldPass(Registry);
initializeLoopUnrollPass(Registry);
initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerGPUIntrinsicPass(Registry);
initializeMergeICmpsLegacyPassPass(Registry);
initializeNaryReassociateLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
diff --git a/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_amdgcn.ll b/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_amdgcn.ll
new file mode 100644
index 0000000000000..dd41474ad4f07
--- /dev/null
+++ b/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_amdgcn.ll
@@ -0,0 +1,315 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt -S -mtriple=amdgcn-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=AMDGCN
+
+@__oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500
+
+define i32 @num_blocks_x() {
+; AMDGCN-LABEL: @num_blocks_x(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 12
+; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0:![0-9]+]], !invariant.load [[META1:![0-9]+]]
+; AMDGCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP5:%.*]] = icmp sge i32 [[TMP4]], 500
+; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 12
+; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 4
+; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP5]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP12:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP3]], [[TMP12]]
+; AMDGCN-NEXT: ret i32 [[TMP13]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.x()
+
+define i32 @num_blocks_y() {
+; AMDGCN-LABEL: @num_blocks_y(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 16
+; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]]
+; AMDGCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP5:%.*]] = icmp sge i32 [[TMP4]], 500
+; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 14
+; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 6
+; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP5]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP12:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP3]], [[TMP12]]
+; AMDGCN-NEXT: ret i32 [[TMP13]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.y()
+
+define i32 @num_blocks_z() {
+; AMDGCN-LABEL: @num_blocks_z(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 20
+; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]]
+; AMDGCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP5:%.*]] = icmp sge i32 [[TMP4]], 500
+; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16
+; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 8
+; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP5]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]]
+; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP12:%.*]] = zext i16 [[TMP11]] to i32
+; AMDGCN-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP3]], [[TMP12]]
+; AMDGCN-NEXT: ret i32 [[TMP13]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.z()
+
+define i32 @block_id_x() {
+; AMDGCN-LABEL: @block_id_x(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.x()
+
+define i32 @block_id_y() {
+; AMDGCN-LABEL: @block_id_y(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.y()
+
+define i32 @block_id_z() {
+; AMDGCN-LABEL: @block_id_z(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.z()
+
+define i32 @num_threads_x() {
+; AMDGCN-LABEL: @num_threads_x(
+; AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP1]], 500
+; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12
+; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 4
+; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT: ret i32 [[TMP9]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.x()
+
+define i32 @num_threads_y() {
+; AMDGCN-LABEL: @num_threads_y(
+; AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP1]], 500
+; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14
+; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 6
+; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT: ret i32 [[TMP9]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.y()
+
+define i32 @num_threads_z() {
+; AMDGCN-LABEL: @num_threads_z(
+; AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
+; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP1]], 500
+; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16
+; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 8
+; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]]
+; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]]
+; AMDGCN-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32
+; AMDGCN-NEXT: ret i32 [[TMP9]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.z()
+
+define i32 @thread_id_x() {
+; AMDGCN-LABEL: @thread_id_x(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.x()
+
+define i32 @thread_id_y() {
+; AMDGCN-LABEL: @thread_id_y(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.y()
+
+define i32 @thread_id_z() {
+; AMDGCN-LABEL: @thread_id_z(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.z()
+
+define i32 @num_lanes() {
+; AMDGCN-LABEL: @num_lanes(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.wavefrontsize()
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.lanes()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.lanes()
+
+define i32 @lane_id() {
+; AMDGCN-LABEL: @lane_id(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]])
+; AMDGCN-NEXT: ret i32 [[TMP2]]
+;
+ %1 = call i32 @llvm.gpu.lane.id()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.lane.id()
+
+define i64 @lane_mask() {
+; AMDGCN-LABEL: @lane_mask(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; AMDGCN-NEXT: ret i64 [[TMP1]]
+;
+ %1 = call i64 @llvm.gpu.lane.mask()
+ ret i64 %1
+}
+
+declare i64 @llvm.gpu.lane.mask()
+
+define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x) {
+; AMDGCN-LABEL: @read_first_lane_u32(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X:%.*]])
+; AMDGCN-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x)
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.read.first.lane.u32(i64, i32)
+
+define i64 @ballot(i64 %lane_mask, i1 zeroext %x) {
+; AMDGCN-LABEL: @ballot(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[X:%.*]])
+; AMDGCN-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[LANE_MASK:%.*]]
+; AMDGCN-NEXT: ret i64 [[TMP2]]
+;
+ %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x)
+ ret i64 %1
+}
+
+declare i64 @llvm.gpu.ballot(i64, i1)
+
+define void @sync_threads() {
+; AMDGCN-LABEL: @sync_threads(
+; AMDGCN-NEXT: call void @llvm.amdgcn.s.barrier()
+; AMDGCN-NEXT: fence syncscope("workgroup") seq_cst
+; AMDGCN-NEXT: ret void
+;
+ call void @llvm.gpu.sync.threads()
+ ret void
+}
+
+declare void @llvm.gpu.sync.threads()
+
+define void @sync_lane(i64 %lane_mask) {
+; AMDGCN-LABEL: @sync_lane(
+; AMDGCN-NEXT: call void @llvm.amdgcn.wave.barrier()
+; AMDGCN-NEXT: ret void
+;
+ call void @llvm.gpu.sync.lane(i64 %lane_mask)
+ ret void
+}
+
+declare void @llvm.gpu.sync.lane(i64)
+
+define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) {
+; AMDGCN-LABEL: @shuffle_idx_u32(
+; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]])
+; AMDGCN-NEXT: [[NOT:%.*]] = sub i32 0, [[WIDTH:%.*]]
+; AMDGCN-NEXT: [[AND:%.*]] = and i32 [[TMP2]], [[NOT]]
+; AMDGCN-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[IDX:%.*]]
+; AMDGCN-NEXT: [[SHL:%.*]] = shl i32 [[ADD]], 2
+; AMDGCN-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[X:%.*]])
+; AMDGCN-NEXT: ret i32 [[TMP3]]
+;
+ %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width)
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32)
+
+define void @gpu_exit() {
+; AMDGCN-LABEL: @gpu_exit(
+; AMDGCN-NEXT: call void @llvm.amdgcn.endpgm()
+; AMDGCN-NEXT: ret void
+;
+ call void @llvm.gpu.exit()
+ ret void
+}
+
+declare void @llvm.gpu.exit()
+
+define void @thread_suspend() {
+; AMDGCN-LABEL: @thread_suspend(
+; AMDGCN-NEXT: call void @llvm.amdgcn.s.sleep(i32 2)
+; AMDGCN-NEXT: ret void
+;
+ call void @llvm.gpu.thread.suspend()
+ ret void
+}
+
+declare void @llvm.gpu.thread.suspend()
diff --git a/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_nvptx.ll b/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_nvptx.ll
new file mode 100644
index 0000000000000..a1c821e2ca939
--- /dev/null
+++ b/llvm/test/Transforms/LowerGPUIntrinsic/gpu_intrinsic_to_nvptx.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt -S -mtriple=nvptx64-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=NVPTX
+
+define i32 @num_blocks_x() {
+; NVPTX-LABEL: @num_blocks_x(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.x()
+
+define i32 @num_blocks_y() {
+; NVPTX-LABEL: @num_blocks_y(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.y()
+
+define i32 @num_blocks_z() {
+; NVPTX-LABEL: @num_blocks_z(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.blocks.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.blocks.z()
+
+define i32 @block_id_x() {
+; NVPTX-LABEL: @block_id_x(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.x()
+
+define i32 @block_id_y() {
+; NVPTX-LABEL: @block_id_y(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.y()
+
+define i32 @block_id_z() {
+; NVPTX-LABEL: @block_id_z(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.block.id.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.block.id.z()
+
+define i32 @num_threads_x() {
+; NVPTX-LABEL: @num_threads_x(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.x()
+
+define i32 @num_threads_y() {
+; NVPTX-LABEL: @num_threads_y(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.y()
+
+define i32 @num_threads_z() {
+; NVPTX-LABEL: @num_threads_z(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.threads.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.threads.z()
+
+define i32 @thread_id_x() {
+; NVPTX-LABEL: @thread_id_x(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.x()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.x()
+
+define i32 @thread_id_y() {
+; NVPTX-LABEL: @thread_id_y(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.y()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.y()
+
+define i32 @thread_id_z() {
+; NVPTX-LABEL: @thread_id_z(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.thread.id.z()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.thread.id.z()
+
+define i32 @num_lanes() {
+; NVPTX-LABEL: @num_lanes(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.num.lanes()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.num.lanes()
+
+define i32 @lane_id() {
+; NVPTX-LABEL: @lane_id(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+; NVPTX-NEXT: ret i32 [[TMP1]]
+;
+ %1 = call i32 @llvm.gpu.lane.id()
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.lane.id()
+
+define i64 @lane_mask() {
+; NVPTX-LABEL: @lane_mask(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.activemask()
+; NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP1]] to i64
+; NVPTX-NEXT: ret i64 [[CONV]]
+;
+ %1 = call i64 @llvm.gpu.lane.mask()
+ ret i64 %1
+}
+
+declare i64 @llvm.gpu.lane.mask()
+
+define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x) {
+; NVPTX-LABEL: @read_first_lane_u32(
+; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[CONV]], i1 true)
+; NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[CONV]], 0
+; NVPTX-NEXT: [[SUB:%.*]] = select i1 [[ISZERO]], i32 -1, i32 [[TMP1]]
+; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[SUB]], i32 31)
+; NVPTX-NEXT: ret i32 [[TMP2]]
+;
+ %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x)
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.read.first.lane.u32(i64, i32)
+
+define i64 @ballot(i64 %lane_mask, i1 zeroext %x) {
+; NVPTX-LABEL: @ballot(
+; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[CONV]], i1 [[X:%.*]])
+; NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP1]] to i64
+; NVPTX-NEXT: ret i64 [[CONV1]]
+;
+ %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x)
+ ret i64 %1
+}
+
+declare i64 @llvm.gpu.ballot(i64, i1)
+
+define void @sync_threads() {
+; NVPTX-LABEL: @sync_threads(
+; NVPTX-NEXT: call void @llvm.nvvm.barrier0()
+; NVPTX-NEXT: ret void
+;
+ call void @llvm.gpu.sync.threads()
+ ret void
+}
+
+declare void @llvm.gpu.sync.threads()
+
+define void @sync_lane(i64 %lane_mask) {
+; NVPTX-LABEL: @sync_lane(
+; NVPTX-NEXT: [[TMP1:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[TMP1]])
+; NVPTX-NEXT: ret void
+;
+ call void @llvm.gpu.sync.lane(i64 %lane_mask)
+ ret void
+}
+
+declare void @llvm.gpu.sync.lane(i64)
+
+define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) {
+; NVPTX-LABEL: @shuffle_idx_u32(
+; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32
+; NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[IDX:%.*]] to i64
+; NVPTX-NEXT: [[TMP1:%.*]] = shl i32 [[WIDTH:%.*]], 8
+; NVPTX-NEXT: [[OR:%.*]] = sub i32 8223, [[TMP1]]
+; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[IDX]], i32 [[OR]])
+; NVPTX-NEXT: [[TMP3:%.*]] = shl i64 1, [[SH_PROM]]
+; NVPTX-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], [[LANE_MASK]]
+; NVPTX-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0
+; NVPTX-NEXT: [[AND4:%.*]] = select i1 [[TMP5]], i32 0, i32 [[TMP2]]
+; NVPTX-NEXT: ret i32 [[AND4]]
+;
+ %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width)
+ ret i32 %1
+}
+
+declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32)
+
+define void @gpu_exit() {
+; NVPTX-LABEL: @gpu_exit(
+; NVPTX-NEXT: call void @llvm.nvvm.exit()
+; NVPTX-NEXT: ret void
+;
+ call void @llvm.gpu.exit()
+ ret void
+}
+
+declare void @llvm.gpu.exit()
+
+define void @thread_suspend() {
+; NVPTX-LABEL: @thread_suspend(
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.reflect(ptr @[[GLOB0:[0-9]+]])
+; NVPTX-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 699
+; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NVPTX: 3:
+; NVPTX-NEXT: call void @llvm.nvvm.nanosleep(i32 64)
+; NVPTX-NEXT: br label [[TMP4]]
+; NVPTX: 4:
+; NVPTX-NEXT: ret void
+;
+ call void @llvm.gpu.thread.suspend()
+ ret void
+}
+
+declare void @llvm.gpu.thread.suspend()