[clang] [Headers] Create stub spirv64intrin.h (PR #131164)
Jon Chesterfield via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 13 09:00:02 PDT 2025
https://github.com/JonChesterfield created https://github.com/llvm/llvm-project/pull/131164
The structure follows amdgpuintrin.h, but with declarations where compiler intrinsics are not yet available.
Address space numbers, the kernel attribute, and checking how this interacts with OpenMP are left for later patches.
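
For orientation, a minimal sketch (not part of the patch) of how a translation unit could pick up these declarations through the umbrella header once the stub lands; the file name and kernel body are illustrative, and the compile line only mirrors the lit test added below.

// example.c -- illustrative sketch only; the __gpu_* names come from the
// header added in this patch, the kernel body is invented for demonstration.
//
// Built roughly like the lit test below:
//   clang -cc1 -internal-isystem <clang>/lib/Headers -triple spirv64-- \
//     -emit-llvm example.c -o -
#include <stdint.h>
#include <gpuintrin.h> // dispatches to <spirv64intrin.h> when __SPIRV64__ is set

// __gpu_kernel and __gpu_global are empty macros in the stub for now.
__gpu_kernel void write_lane_ids(__gpu_global uint32_t *out) {
  // These are plain declarations in the stub; lowering to SPIR-V builtins is
  // left for later patches, so linking needs a runtime that provides them.
  uint32_t gid =
      __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  out[gid] = __gpu_lane_id();
}
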
From 0b8f555fd2561c7aac92935784caaaa365328ea9 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Thu, 13 Mar 2025 15:44:52 +0000
Subject: [PATCH] [Headers] Create stub spirv64intrin.h
---
clang/lib/Headers/amdgpuintrin.h | 2 +-
clang/lib/Headers/gpuintrin.h | 2 +
clang/lib/Headers/spirv64intrin.h | 131 ++++++++++++++++++
clang/test/Headers/gpuintrin.c | 223 ++++++++++++++++++++++++++++++
4 files changed, 357 insertions(+), 1 deletion(-)
create mode 100644 clang/lib/Headers/spirv64intrin.h
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..817cfeec896c4 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -1,4 +1,4 @@
-//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 0fb3916acac61..cf1cfd41a6788 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
+#elif defined(__SPIRV64__)
+#include <spirv64intrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif
diff --git a/clang/lib/Headers/spirv64intrin.h b/clang/lib/Headers/spirv64intrin.h
new file mode 100644
index 0000000000000..2b9157544f170
--- /dev/null
+++ b/clang/lib/Headers/spirv64intrin.h
@@ -0,0 +1,131 @@
+//===-- spirv64intrin.h - SPIRV64 intrinsic functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRV64INTRIN_H
+#define __SPIRV64INTRIN_H
+
+#ifndef __SPIRV64__
+#error "This file is intended for SPIRV64 targets or offloading to SPIRV64"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirv64intrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+// This is the skeleton of the spirv64 implementation of gpuintrin.h.
+// Address spaces and the kernel attribute are not yet implemented.
+// The target-specific functions are declarations awaiting clang support.
+
+#if defined(_OPENMP)
+#error "Openmp is not yet available on spirv64 though gpuintrin header"
+#endif
+
+// Type aliases to the address spaces used by the SPIRV backend.
+#define __gpu_private
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global
+#define __gpu_generic
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_x(void);
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_y(void);
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_z(void);
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS uint32_t __gpu_block_id_x(void);
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS uint32_t __gpu_block_id_y(void);
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS uint32_t __gpu_block_id_z(void);
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_threads_x(void);
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_threads_y(void);
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_threads_z(void);
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS uint32_t __gpu_thread_id_x(void);
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS uint32_t __gpu_thread_id_y(void);
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS uint32_t __gpu_thread_id_z(void);
+
+// Returns the size of the wave.
+_DEFAULT_FN_ATTRS uint32_t __gpu_num_lanes(void);
+
+// Returns the id of the thread inside of a wave executing together.
+_DEFAULT_FN_ATTRS uint32_t __gpu_lane_id(void);
+
+// Returns the bit-mask of active threads in the current wave.
+_DEFAULT_FN_ATTRS uint64_t __gpu_lane_mask(void);
+
+// Copies the value from the first active thread in the wave to the rest.
+_DEFAULT_FN_ATTRS uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask,
+ uint32_t __x);
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x);
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS void __gpu_sync_threads(void);
+
+// Waits for all threads in the wave to converge.
+_DEFAULT_FN_ATTRS void __gpu_sync_lane(uint64_t __lane_mask);
+
+// Shuffles the lanes inside the wave according to the given index.
+_DEFAULT_FN_ATTRS uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask,
+ uint32_t __idx, uint32_t __x,
+ uint32_t __width);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Terminates execution of the associated wave.
+_DEFAULT_FN_ATTRS [[noreturn]] void __gpu_exit(void);
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS void __gpu_thread_suspend(void);
+
+#endif // __SPIRV64INTRIN_H
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 9a15ce277ba87..eaf001be19ac9 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -9,6 +9,11 @@
// RUN: -target-feature +ptx62 \
// RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefix=NVPTX
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN: -internal-isystem %S/../../lib/Headers/ \
+// RUN: -triple spirv64-- -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SPIRV64
#include <gpuintrin.h>
@@ -978,6 +983,224 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: call void @llvm.nvvm.exit()
// NVPTX-NEXT: ret void
//
+//
+// SPIRV64-LABEL: define spir_func void @foo(
+// SPIRV64-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0)
+// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV64-NEXT: [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV64-NEXT: [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV64-NEXT: [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0)
+// SPIRV64-NEXT: [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV64-NEXT: [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV64-NEXT: [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV64-NEXT: [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0)
+// SPIRV64-NEXT: [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV64-NEXT: [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV64-NEXT: [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV64-NEXT: [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0)
+// SPIRV64-NEXT: [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV64-NEXT: [[CALL17:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV64-NEXT: [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRV64-NEXT: [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1)
+// SPIRV64-NEXT: [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1)
+// SPIRV64-NEXT: [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true)
+// SPIRV64-NEXT: call spir_func void @__gpu_sync_threads()
+// SPIRV64-NEXT: call spir_func void @__gpu_sync_lane(i64 noundef -1)
+// SPIRV64-NEXT: [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
+// SPIRV64-NEXT: [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
+// SPIRV64-NEXT: [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRV64-NEXT: call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]]
+// SPIRV64-NEXT: unreachable
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_blocks(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT: ]
+// SPIRV64: [[SW_BB]]:
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN:.*]]
+// SPIRV64: [[SW_BB1]]:
+// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_BB3]]:
+// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_DEFAULT]]:
+// SPIRV64-NEXT: unreachable
+// SPIRV64: [[RETURN]]:
+// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_block_id(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT: ]
+// SPIRV64: [[SW_BB]]:
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN:.*]]
+// SPIRV64: [[SW_BB1]]:
+// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_BB3]]:
+// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_DEFAULT]]:
+// SPIRV64-NEXT: unreachable
+// SPIRV64: [[RETURN]]:
+// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_threads(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT: ]
+// SPIRV64: [[SW_BB]]:
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN:.*]]
+// SPIRV64: [[SW_BB1]]:
+// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_BB3]]:
+// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_DEFAULT]]:
+// SPIRV64-NEXT: unreachable
+// SPIRV64: [[RETURN]]:
+// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i32 @__gpu_thread_id(
+// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRV64-NEXT: ]
+// SPIRV64: [[SW_BB]]:
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN:.*]]
+// SPIRV64: [[SW_BB1]]:
+// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_BB3]]:
+// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: br label %[[RETURN]]
+// SPIRV64: [[SW_DEFAULT]]:
+// SPIRV64-NEXT: unreachable
+// SPIRV64: [[RETURN]]:
+// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRV64-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT: [[__HI:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: [[__LO:%.*]] = alloca i32, align 4
+// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// SPIRV64-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// SPIRV64-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
+// SPIRV64-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRV64-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// SPIRV64-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// SPIRV64-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
+// SPIRV64-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]])
+// SPIRV64-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV64-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// SPIRV64-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]])
+// SPIRV64-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// SPIRV64-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// SPIRV64-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// SPIRV64-NEXT: ret i64 [[OR]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func i64 @__gpu_first_lane_id(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
+// SPIRV64-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1
+// SPIRV64-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
+// SPIRV64-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
+// SPIRV64-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32
+// SPIRV64-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
+// SPIRV64-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64
+// SPIRV64-NEXT: ret i64 [[CONV]]
+//
+//
+// SPIRV64-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane(
+// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRV64-NEXT: [[ENTRY:.*:]]
+// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV64-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]])
+// SPIRV64-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
+// SPIRV64-NEXT: ret i1 [[CMP]]
+//
//.
// AMDGPU: [[RNG3]] = !{i32 1, i32 0}
// AMDGPU: [[META4]] = !{}