[clang] [Clang][AMDGPU] Add clang builtins for buffer format load/store intrinsics (PR #187064)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 17 10:09:22 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Rana Pratap Reddy (ranapratap55)
<details>
<summary>Changes</summary>
This patch adds new Clang builtins for the AMDGPU raw/struct buffer format load/store intrinsics. Clang currently has `__builtin_amdgcn_raw_buffer_load_b*` and `__builtin_amdgcn_raw_buffer_store_b*` builtins, but is missing builtins for the format variants. These format intrinsics are currently used by device-libs via manually written IR wrappers in [buffer-intrinsics.ll](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/ockl/src/buffer-intrinsics.ll).
---
Patch is 37.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/187064.diff
10 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+9)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+29)
- (added) clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip (+208)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-format.cl (+44)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store-format.cl (+44)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-load-format.cl (+35)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-store-format.cl (+35)
- (added) clang/test/SemaHIP/builtins-amdgcn-buffer-format.hip (+46)
- (added) clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-format-error.cl (+24)
- (added) clang/test/SemaOpenCL/builtins-amdgcn-struct-buffer-format-error.cl (+23)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index acd0a34a79253..664655a1d4bfc 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -252,6 +252,15 @@ def __builtin_amdgcn_raw_buffer_load_b64 : AMDGPUBuiltin<"_ExtVector<2, unsigned
def __builtin_amdgcn_raw_buffer_load_b96 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
def __builtin_amdgcn_raw_buffer_load_b128 : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_load_format_v4f32 : AMDGPUBuiltin<"_ExtVector<4, float>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_load_format_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_store_format_v4f32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_store_format_v4f16 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_struct_buffer_load_format_v4f32 : AMDGPUBuiltin<"_ExtVector<4, float>(__amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">;
+def __builtin_amdgcn_struct_buffer_load_format_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(__amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">;
+def __builtin_amdgcn_struct_buffer_store_format_v4f32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, __amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">;
+def __builtin_amdgcn_struct_buffer_store_format_v4f16 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, __amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">;
+
def __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32 : AMDGPUBuiltin<"int(int, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
def __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32 : AMDGPUBuiltin<"float(float, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fadd-rtn-insts">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 0d572d37ab972..13236a177b398 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -2028,6 +2028,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
return emitBuiltinWithOneOverloadedType<5>(
*this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
+ case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f32:
+ case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f16:
+ return emitBuiltinWithOneOverloadedType<5>(
+ *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store_format);
case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
@@ -2061,6 +2065,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
}
+ case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f32:
+ case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f16: {
+ llvm::Type *RetTy = ConvertType(E->getType());
+ Function *F =
+ CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load_format, {RetTy});
+
+ return Builder.CreateCall(
+ F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
+ }
+ case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f32:
+ case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f16:
+ return emitBuiltinWithOneOverloadedType<6>(
+ *this, E, Intrinsic::amdgcn_struct_ptr_buffer_store_format);
+ case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f32:
+ case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f16: {
+ llvm::Type *RetTy = ConvertType(E->getType());
+ Function *F = CGM.getIntrinsic(
+ Intrinsic::amdgcn_struct_ptr_buffer_load_format, {RetTy});
+
+ return Builder.CreateCall(
+ F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)),
+ EmitScalarExpr(E->getArg(4))});
+ }
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
return emitBuiltinWithOneOverloadedType<5>(
*this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip
new file mode 100644
index 0000000000000..1534d2dd4810c
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip
@@ -0,0 +1,208 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu verde -emit-llvm -disable-llvm-optzns -fcuda-is-device -o - %s | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+typedef float v4f32 __attribute__((ext_vector_type(4)));
+typedef _Float16 v4f16 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
+__device__ v4f32 test_raw_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+ return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: ret <4 x half> [[TMP3]]
+//
+__device__ v4f16 test_raw_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+ return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0)
+// CHECK-NEXT: ret void
+//
+__device__ void test_raw_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+ __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0)
+// CHECK-NEXT: ret void
+//
+__device__ void test_raw_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
+ __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tiii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0)
+// CHECK-NEXT: ret <4 x float> [[TMP4]]
+//
+__device__ v4f32 test_struct_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) {
+ return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tiii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0)
+// CHECK-NEXT: ret <4 x half> [[TMP4]]
+//
+__device__ v4f16 test_struct_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) {
+ return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tiii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], i32 0)
+// CHECK-NEXT: ret void
+//
+__device__ void test_struct_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) {
+ __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tiii(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr
+// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
+// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr
+// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr
+// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr
+// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/187064
More information about the cfe-commits
mailing list