[clang] [llvm] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (PR #199176)
Sameer Sahasrabuddhe via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 4 23:12:11 PDT 2026
https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/199176
>From ad5404e9eb3bace085a8c51838de2e1553e7cf90 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 21 May 2026 16:55:40 +0530
Subject: [PATCH 1/5] [AMDGPU][Clang] refactor addrspace and scope checks [NFC]
Assisted-By: Claude Opus 4.6
---
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/lib/Sema/SemaAMDGPU.cpp | 54 ++++++++++---------
...mdgcn-error-gfx1250-cooperative-atomics.cl | 2 +-
3 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index dbe6cb2c3a41c..76bacb7d49c8b 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -14230,7 +14230,7 @@ def note_amdgcn_unguarded_builtin_silence
: Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence "
"this warning">;
-def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+def err_amdgcn_global_or_flat_pointer_required : Error<"builtin requires a global or generic pointer">;
def err_amdgcn_dmask_has_too_many_bits_set
: Error<"dmask argument cannot have more bits set than there are elements "
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 1d2b3898c92d6..385736c7e1eac 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -452,19 +452,35 @@ bool SemaAMDGPU::checkAtomicOrderingCABIArg(Expr *E, bool MayLoad,
return false;
}
-bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
- bool Fail = false;
-
- // First argument is a global or generic pointer.
+// Check that the first argument to TheCall is a global or generic pointer.
+static bool checkGlobalOrFlatPointerArg(SemaAMDGPU &S, CallExpr *TheCall) {
Expr *PtrArg = TheCall->getArg(0);
QualType PtrTy = PtrArg->getType()->getPointeeType();
- unsigned AS = getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace());
+ unsigned AS =
+ S.getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace());
if (AS != llvm::AMDGPUAS::FLAT_ADDRESS &&
- AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) {
- Fail = true;
- Diag(TheCall->getBeginLoc(), diag::err_amdgcn_coop_atomic_invalid_as)
- << PtrArg->getSourceRange();
- }
+ AS != llvm::AMDGPUAS::GLOBAL_ADDRESS)
+ return S.Diag(TheCall->getBeginLoc(),
+ diag::err_amdgcn_global_or_flat_pointer_required)
+ << PtrArg->getSourceRange();
+ return false;
+}
+
+static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) {
+ if (Scope->isValueDependent())
+ return false;
+ auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic);
+ if (std::optional<llvm::APSInt> Result =
+ Scope->getIntegerConstantExpr(S.SemaRef.Context))
+ if (!ScopeModel->isValid(Result->getZExtValue()))
+ return S.Diag(Scope->getBeginLoc(),
+ diag::err_atomic_op_has_invalid_sync_scope)
+ << Scope->getSourceRange();
+ return false;
+}
+
+bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
+ bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall);
Expr *AO = TheCall->getArg(IsStore ? 2 : 1);
Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
@@ -488,27 +504,15 @@ bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
}
bool SemaAMDGPU::checkAtomicMonitorLoad(CallExpr *TheCall) {
- bool Fail = false;
-
Expr *AO = TheCall->getArg(1);
Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
if (AO->isValueDependent() || Scope->isValueDependent())
return false;
- Fail |= checkAtomicOrderingCABIArg(TheCall->getArg(1), /*MayLoad=*/true,
- /*MayStore=*/false);
-
- auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic);
- if (std::optional<llvm::APSInt> Result =
- Scope->getIntegerConstantExpr(SemaRef.Context)) {
- if (!ScopeModel->isValid(Result->getZExtValue())) {
- Diag(Scope->getBeginLoc(), diag::err_atomic_op_has_invalid_sync_scope)
- << Scope->getSourceRange();
- Fail = true;
- }
- }
-
+ bool Fail = checkAtomicOrderingCABIArg(AO, /*MayLoad=*/true,
+ /*MayStore=*/false);
+ Fail |= checkScopeAsInt(*this, Scope);
return Fail;
}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
index 8f02e6775d37a..a440a1c040270 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
@@ -48,7 +48,7 @@ v4i test_amdgcn_cooperative_atomic_load_8x16B_acq_rel(global v4i* gaddr)
void test_amdgcn_cooperative_atomic_store_32x4B__sharedptr(local int* addr, int val)
{
- __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{cooperative atomic requires a global or generic pointer}}
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{builtin requires a global or generic pointer}}
}
void test_amdgcn_cooperative_atomic_store_32x4B__ordering_not_imm(local int* addr, int ord, int val)
>From 2f00bae403e863bec2260e98ae45b71f7e45f85a Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Mar 2026 12:30:16 +0530
Subject: [PATCH 2/5] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128
These builtins allow the program to request store-available and load-visible
accesses as described in #191246. Each of them takes a __MEMORY_SCOPE_* operand
that is then translated to target-specific cache policy bits.
This patch was extracted from #172090.
Co-authored-by: macurtis-amd <macurtis at amd.com>
Assisted-by: Claude Opus 4.6z
---
clang/docs/LanguageExtensions.rst | 28 ++
clang/include/clang/Basic/BuiltinsAMDGPU.td | 7 +
clang/include/clang/Sema/SemaAMDGPU.h | 1 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 16 ++
clang/lib/Sema/SemaAMDGPU.cpp | 12 +
.../builtins-amdgcn-global-load-store.cl | 250 ++++++++++++++++++
...builtins-amdgcn-global-load-store-error.cl | 22 ++
...s-amdgcn-global-load-store-target-error.cl | 26 ++
8 files changed, 362 insertions(+)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 03cb02deb5e7f..6ad4e76c239d9 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5259,6 +5259,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros
and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values
corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.)
+.. _langext-__scoped_atomic:
+
__scoped_atomic builtins
------------------------
@@ -5754,6 +5756,32 @@ returns the bit at the position of the current lane. It is almost equivalent to
``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
the given mask has the same value for all active lanes of the current wave.
+
+__builtin_amdgcn_av_{load,store}_b128
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Signature:
+
+.. code-block:: c
+
+ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
+
+ v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope);
+
+ void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope);
+
+Load or store a vector of 4 unsigned integers from or to memory with cache
+behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros
+defined for :ref:`scoped atomic builtins<langext-__c11_atomic>`.
+
+The pointer argument must point to the global or generic address space.
+
+These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets.
+
+They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and
+``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend
+<https://llvm.org/docs/AMDGPUUsage.html>`_.
+
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index b15a36df6c08f..a91ad0431d4a6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -280,6 +280,13 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp
def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
+//===----------------------------------------------------------------------===//
+// Global Available/Visible memory accesses.
+//===----------------------------------------------------------------------===//
+
+def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">;
+def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">;
+
//===----------------------------------------------------------------------===//
// Async mark builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d520f3df544f4..a6205534e0de3 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase {
bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore);
bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+ bool checkAVLoadStore(CallExpr *TheCall, bool IsStore);
bool checkAtomicMonitorLoad(CallExpr *TheCall);
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index cb883e8780e59..fc6e397674cd1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1012,6 +1012,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
return Builder.CreateCall(F, {Args});
}
+ case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+ case AMDGPU::BI__builtin_amdgcn_av_store_b128: {
+ const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128;
+ SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+ if (IsStore)
+ Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+ const unsigned ScopeIdx = E->getNumArgs() - 1;
+ auto *ScopeExpr =
+ cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx)));
+ Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue()));
+ llvm::Function *F =
+ CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_av_store_b128
+ : Intrinsic::amdgcn_av_load_b128,
+ {Args[0]->getType()});
+ return Builder.CreateCall(F, Args);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 385736c7e1eac..f2dbe28b76b5c 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15);
+ case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+ return checkAVLoadStore(TheCall, /*IsStore=*/false);
+ case AMDGPU::BI__builtin_amdgcn_av_store_b128:
+ return checkAVLoadStore(TheCall, /*IsStore=*/true);
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
@@ -479,6 +483,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) {
return false;
}
+bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) {
+ if (checkGlobalOrFlatPointerArg(*this, TheCall))
+ return true;
+
+ Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
+ return checkScopeAsInt(*this, Scope);
+}
+
bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
new file mode 100644
index 0000000000000..63d7fcac16874
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
@@ -0,0 +1,250 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+//------------------------------------------------------------------------------
+// Global Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Global Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_device(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_system(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_single(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+//.
+// CHECK: [[META7]] = !{!"wavefront"}
+// CHECK: [[META8]] = !{!"workgroup"}
+// CHECK: [[META9]] = !{!"agent"}
+// CHECK: [[META10]] = !{!""}
+// CHECK: [[META11]] = !{!"singlethread"}
+// CHECK: [[META12]] = !{!"cluster"}
+//.
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
new file mode 100644
index 0000000000000..b2f7b46547632
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+typedef v4u32 __private *private_ptr_to_v4u32;
+
+void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}}
+}
+
+void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
new file mode 100644
index 0000000000000..e85b120661cfd
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -0,0 +1,26 @@
+// We test loads and stores separately because clang only seems to exit after
+// the first 'target feature' error.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD -S -verify -o - %s
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+#ifdef TEST_LOAD
+v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
+ return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
+}
+#endif
+
+#ifdef TEST_STORE
+void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
+ __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
+}
+#endif
>From c295aabe208b0650f2d5ac4bde4ec6135ed2a285 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 2 Jun 2026 16:57:20 +0530
Subject: [PATCH 3/5] add docs for the builtins; split the target test; add a
host/device test
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 10 +++++--
.../include/clang/Basic/BuiltinsAMDGPUDocs.td | 29 +++++++++++++++++++
clang/test/SemaHIP/amdgpu-av-load-store.hip | 20 +++++++++++++
...s-amdgcn-global-load-store-target-error.cl | 26 ++++++++---------
4 files changed, 69 insertions(+), 16 deletions(-)
create mode 100644 clang/test/SemaHIP/amdgpu-av-load-store.hip
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index a91ad0431d4a6..a08e922ec1f7a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -284,8 +284,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am
// Global Available/Visible memory accesses.
//===----------------------------------------------------------------------===//
-def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">;
-def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">;
+def __builtin_amdgcn_av_load_b128
+ : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> {
+ let Documentation = [DocAVLoadB128];
+}
+def __builtin_amdgcn_av_store_b128
+ : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> {
+ let Documentation = [DocAVStoreB128];
+}
//===----------------------------------------------------------------------===//
// Async mark builtins.
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index cb2f000fcf548..04a3451adf667 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -573,6 +573,35 @@ WMMA with per-operand scale factors applied during the computation.
}];
}
+//===----------------------------------------------------------------------===//
+// Global Available/Visible Memory Access Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatAVLoadStore : DocumentationCategory<"Available/Visible Memory Access Builtins"> {
+ let Content = [{
+These builtins perform 128-bit global or flat memory loads and stores with
+available/visible (AV) semantics.
+}];
+}
+
+def DocAVLoadB128 : Documentation {
+ let Category = DocCatAVLoadStore;
+ let Content = [{
+Loads 128 bits (4 x i32) from the pointer ``ptr``. The pointer must be in
+the global or generic address space. The ``scope`` argument specifies the
+synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
+def DocAVStoreB128 : Documentation {
+ let Category = DocCatAVLoadStore;
+ let Content = [{
+Stores 128 bits (4 x i32) of ``data`` to the pointer ``ptr``. The pointer
+must be in the global or generic address space. The ``scope`` argument
+specifies the synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
//===----------------------------------------------------------------------===//
// Wave Data Exchange Builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/test/SemaHIP/amdgpu-av-load-store.hip b/clang/test/SemaHIP/amdgpu-av-load-store.hip
new file mode 100644
index 0000000000000..1e9688e891228
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-av-load-store.hip
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+
+__device__ void test_av_load_store_device(v4u32 *ptr, v4u32 data) {
+ v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+ __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}
+
+__global__ void test_av_load_store_kernel(v4u32 *ptr, v4u32 data) {
+ v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+ __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
index e85b120661cfd..cec85fbeb9446 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -1,26 +1,24 @@
-// We test loads and stores separately because clang only seems to exit after
-// the first 'target feature' error.
-
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD -S -verify -o - %s
-
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/store.cl
// REQUIRES: amdgpu-registered-target
+//--- load.cl
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
typedef v4u32 __global *global_ptr_to_v4u32;
-#ifdef TEST_LOAD
v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
}
-#endif
-#ifdef TEST_STORE
+//--- store.cl
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
__builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
}
-#endif
>From 1834a71e9907c7c3d1cb276f622b3600b2ac4de2 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 3 Jun 2026 12:21:37 +0530
Subject: [PATCH 4/5] add ArgNames; use target feature "flat-global-insts"
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 8 +++++---
.../builtins-amdgcn-global-load-store-target-error.cl | 4 ++--
llvm/lib/TargetParser/AMDGPUTargetParser.cpp | 8 ++++++++
3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index a08e922ec1f7a..0b50946bb47b3 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -285,12 +285,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am
//===----------------------------------------------------------------------===//
def __builtin_amdgcn_av_load_b128
- : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> {
+ : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "flat-global-insts"> {
let Documentation = [DocAVLoadB128];
+ let ArgNames = ["ptr", "scope"];
}
def __builtin_amdgcn_av_store_b128
- : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> {
+ : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "flat-global-insts"> {
let Documentation = [DocAVStoreB128];
+ let ArgNames = ["ptr", "data", "scope"];
}
//===----------------------------------------------------------------------===//
@@ -375,7 +377,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i
// GFX9+ only builtins.
//===----------------------------------------------------------------------===//
-def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">;
+def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "flat-global-insts">;
def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">;
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
index cec85fbeb9446..9a61513cdc05b 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -12,7 +12,7 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int
typedef v4u32 __global *global_ptr_to_v4u32;
v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
- return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
+ return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature flat-global-insts}}
}
//--- store.cl
@@ -20,5 +20,5 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int
typedef v4u32 __global *global_ptr_to_v4u32;
void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
- __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
+ __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature flat-global-insts}}
}
diff --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
index 756b7c2154ca2..24e6ece329c4c 100644
--- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
+++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
@@ -226,6 +226,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["gfx10-3-insts"] = true;
Features["gfx11-insts"] = true;
@@ -280,6 +281,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["gfx10-3-insts"] = true;
Features["gfx11-insts"] = true;
@@ -313,6 +315,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["gfx10-3-insts"] = true;
Features["gfx11-insts"] = true;
@@ -354,6 +357,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["gfx10-3-insts"] = true;
Features["gfx11-insts"] = true;
@@ -391,6 +395,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["gfx10-3-insts"] = true;
Features["image-insts"] = true;
@@ -427,6 +432,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dpp"] = true;
Features["gfx8-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx10-insts"] = true;
Features["image-insts"] = true;
Features["s-memrealtime"] = true;
@@ -486,6 +492,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["dot7-insts"] = true;
Features["dot10-insts"] = true;
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["gfx8-insts"] = true;
Features["16-bit-insts"] = true;
Features["dpp"] = true;
@@ -532,6 +539,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
case GK_GFX900:
case GK_GFX9_GENERIC:
Features["gfx9-insts"] = true;
+ Features["flat-global-insts"] = true;
Features["vmem-to-lds-load-insts"] = true;
[[fallthrough]];
case GK_GFX810:
>From 55d2d6efdd2e33aa38c33fe84d9174872cdc6645 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Fri, 5 Jun 2026 11:42:00 +0530
Subject: [PATCH 5/5] revert incorrect replacement
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 0b50946bb47b3..18ad906a0b03b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -377,7 +377,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i
// GFX9+ only builtins.
//===----------------------------------------------------------------------===//
-def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "flat-global-insts">;
+def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">;
def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">;
More information about the cfe-commits
mailing list