[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

Fri Sep 27 11:41:57 PDT 2024

================
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
----------------
jdoerfert wrote:

I'd hope the gpu target triple would turn on auto-convergent. Setting it manually is always going to fail horribly (since it's not set transitively). Thus, adding it here is pointless and might just hide the real issue under one layer of "it seems to be working".

https://github.com/llvm/llvm-project/pull/110179