[llvm-branch-commits] [clang] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (PR #199176)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu May 21 23:17:45 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Sameer Sahasrabuddhe (ssahasra)
<details>
<summary>Changes</summary>
These builtins allow the program to request store-available and load-visible accesses as described in #<!-- -->191246. Each of them takes a __MEMORY_SCOPE_* operand that is then translated to target-specific cache policy bits.
This patch was extracted from #<!-- -->172090.
Assisted-by: Claude Opus 4.6
---
Patch is 21.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199176.diff
8 Files Affected:
- (modified) clang/docs/LanguageExtensions.rst (+28)
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+7)
- (modified) clang/include/clang/Sema/SemaAMDGPU.h (+1)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+16)
- (modified) clang/lib/Sema/SemaAMDGPU.cpp (+12)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl (+250)
- (added) clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl (+22)
- (added) clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl (+26)
``````````diff
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 03cb02deb5e7f..6ad4e76c239d9 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5259,6 +5259,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros
and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values
corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.)
+.. _langext-__scoped_atomic:
+
__scoped_atomic builtins
------------------------
@@ -5754,6 +5756,32 @@ returns the bit at the position of the current lane. It is almost equivalent to
``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
the given mask has the same value for all active lanes of the current wave.
+
+__builtin_amdgcn_av_{load,store}_b128
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Signature:
+
+.. code-block:: c
+
+ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
+
+ v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope);
+
+ void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope);
+
+Load or store a vector of 4 unsigned integers from or to memory with cache
+behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros
+defined for :ref:`scoped atomic builtins<langext-__c11_atomic>`.
+
+The pointer argument must point to the global or generic address space.
+
+These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets.
+
+They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and
+``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend
+<https://llvm.org/docs/AMDGPUUsage.html>`_.
+
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index b15a36df6c08f..a91ad0431d4a6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -280,6 +280,13 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp
def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
+//===----------------------------------------------------------------------===//
+// Global Available/Visible memory accesses.
+//===----------------------------------------------------------------------===//
+
+def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">;
+def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">;
+
//===----------------------------------------------------------------------===//
// Async mark builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d520f3df544f4..a6205534e0de3 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase {
bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore);
bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+ bool checkAVLoadStore(CallExpr *TheCall, bool IsStore);
bool checkAtomicMonitorLoad(CallExpr *TheCall);
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index cb883e8780e59..fc6e397674cd1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1012,6 +1012,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
return Builder.CreateCall(F, {Args});
}
+ case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+ case AMDGPU::BI__builtin_amdgcn_av_store_b128: {
+ const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128;
+ SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+ if (IsStore)
+ Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+ const unsigned ScopeIdx = E->getNumArgs() - 1;
+ auto *ScopeExpr =
+ cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx)));
+ Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue()));
+ llvm::Function *F =
+ CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_av_store_b128
+ : Intrinsic::amdgcn_av_load_b128,
+ {Args[0]->getType()});
+ return Builder.CreateCall(F, Args);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 385736c7e1eac..f2dbe28b76b5c 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15);
+ case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+ return checkAVLoadStore(TheCall, /*IsStore=*/false);
+ case AMDGPU::BI__builtin_amdgcn_av_store_b128:
+ return checkAVLoadStore(TheCall, /*IsStore=*/true);
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
@@ -479,6 +483,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) {
return false;
}
+bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) {
+ if (checkGlobalOrFlatPointerArg(*this, TheCall))
+ return true;
+
+ Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
+ return checkScopeAsInt(*this, Scope);
+}
+
bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
new file mode 100644
index 0000000000000..63d7fcac16874
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
@@ -0,0 +1,250 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+//------------------------------------------------------------------------------
+// Global Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Global Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+//.
+// CHECK: [[META7]] = !{!"wavefront"}
+// CHECK: [[META8]] = !{!"workgroup"}
+// CHECK: [[META9]] = !{!"agent"}
+// CHECK: [[META10]] = !{!""}
+// CHECK: [[META11]] = !{!"singlethread"}
+// CHECK: [[META12]] = !{!"cluster"}
+//.
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
new file mode 100644
index 0000000000000..b2f7b46547632
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+typedef v4u32 __private *private_ptr_to_v4u32;
+
+void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}}
+}
+
+void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
new file mode 100644
index 0000000000000..e85b120661cfd
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -0,0 +1,26 @@
+// We test loads and stores separately because clang only seems to exit after
+// the first 'target feature' error.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -D...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/199176
More information about the llvm-branch-commits
mailing list