[clang] [llvm] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (PR #199176)

Tue Jun 9 05:00:51 PDT 2026

https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/199176

>From 9315208f004466ac908acdd2830856706492b9a7 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Mar 2026 12:30:16 +0530
Subject: [PATCH 1/4] [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128

These builtins allow the program to request store-available and load-visible
accesses as described in #191246. Each of them takes a __MEMORY_SCOPE_* operand
that is then translated to target-specific cache policy bits.

This patch was extracted from #172090.

Co-authored-by: macurtis-amd <macurtis at amd.com>
Assisted-by: Claude Opus 4.6z
---
 clang/docs/LanguageExtensions.rst             |  28 ++
 clang/include/clang/Basic/BuiltinsAMDGPU.td   |   7 +
 clang/include/clang/Sema/SemaAMDGPU.h         |   1 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  16 ++
 clang/lib/Sema/SemaAMDGPU.cpp                 |  12 +
 .../builtins-amdgcn-global-load-store.cl      | 250 ++++++++++++++++++
 ...builtins-amdgcn-global-load-store-error.cl |  22 ++
 ...s-amdgcn-global-load-store-target-error.cl |  26 ++
 8 files changed, 362 insertions(+)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index fbb9947f39d3e..9fa39c09b1f6c 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5261,6 +5261,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros
 and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values
 corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.)
 
+.. _langext-__scoped_atomic:
+
 __scoped_atomic builtins
 ------------------------
 
@@ -5756,6 +5758,32 @@ returns the bit at the position of the current lane. It is almost equivalent to
 ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
 the given mask has the same value for all active lanes of the current wave.
 
+
+__builtin_amdgcn_av_{load,store}_b128
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Signature:
+
+.. code-block:: c
+
+    typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
+
+    v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope);
+
+    void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope);
+
+Load or store a vector of 4 unsigned integers from or to memory with cache
+behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros
+defined for :ref:`scoped atomic builtins<langext-__c11_atomic>`.
+
+The pointer argument must point to the global or generic address space.
+
+These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets.
+
+They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and
+``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend
+<https://llvm.org/docs/AMDGPUUsage.html>`_.
+
 ARM/AArch64 Language Extensions
 -------------------------------
 
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index d8020bdcc8458..afcafe4defdbe 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -280,6 +280,13 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp
 def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
 def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
 
+//===----------------------------------------------------------------------===//
+// Global Available/Visible memory accesses.
+//===----------------------------------------------------------------------===//
+
+def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">;
+def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">;
+
 //===----------------------------------------------------------------------===//
 // Async mark builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d520f3df544f4..a6205534e0de3 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase {
   bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore);
 
   bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+  bool checkAVLoadStore(CallExpr *TheCall, bool IsStore);
   bool checkAtomicMonitorLoad(CallExpr *TheCall);
 
   bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index a88dbb71b3ddf..21f32b12c4fd1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1009,6 +1009,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
     return Builder.CreateCall(F, {Args});
   }
+  case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+  case AMDGPU::BI__builtin_amdgcn_av_store_b128: {
+    const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128;
+    SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+    if (IsStore)
+      Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+    const unsigned ScopeIdx = E->getNumArgs() - 1;
+    auto *ScopeExpr =
+        cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx)));
+    Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue()));
+    llvm::Function *F =
+        CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_av_store_b128
+                                 : Intrinsic::amdgcn_av_load_b128,
+                         {Args[0]->getType()});
+    return Builder.CreateCall(F, Args);
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 60f74fd15226f..757cdfbf20819 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
     return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15);
+  case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+    return checkAVLoadStore(TheCall, /*IsStore=*/false);
+  case AMDGPU::BI__builtin_amdgcn_av_store_b128:
+    return checkAVLoadStore(TheCall, /*IsStore=*/true);
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
@@ -482,6 +486,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) {
   return false;
 }
 
+bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) {
+  if (checkGlobalOrFlatPointerArg(*this, TheCall))
+    return true;
+
+  Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
+  return checkScopeAsInt(*this, Scope);
+}
+
 bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
   bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall);
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
new file mode 100644
index 0000000000000..63d7fcac16874
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
@@ -0,0 +1,250 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+//------------------------------------------------------------------------------
+// Global Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Global Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+//.
+// CHECK: [[META7]] = !{!"wavefront"}
+// CHECK: [[META8]] = !{!"workgroup"}
+// CHECK: [[META9]] = !{!"agent"}
+// CHECK: [[META10]] = !{!""}
+// CHECK: [[META11]] = !{!"singlethread"}
+// CHECK: [[META12]] = !{!"cluster"}
+//.
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
new file mode 100644
index 0000000000000..b2f7b46547632
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950         -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+typedef v4u32 __private *private_ptr_to_v4u32;
+
+void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);  //expected-error{{builtin requires a global or generic pointer}}
+}
+
+void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, 42);  //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);  //expected-error{{builtin requires a global or generic pointer}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, 42);  //expected-error{{synchronization scope argument to atomic operation is invalid}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
new file mode 100644
index 0000000000000..e85b120661cfd
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -0,0 +1,26 @@
+// We test loads and stores separately because clang only seems to exit after
+// the first 'target feature' error.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD  -S -verify -o - %s
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+#ifdef TEST_LOAD
+v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
+}
+#endif
+
+#ifdef TEST_STORE
+void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
+}
+#endif

>From 43d1fb1d47b39fd914a6c7f0ac329193ea80fcee Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 2 Jun 2026 16:57:20 +0530
Subject: [PATCH 2/4] add docs for the builtins; split the target test; add a
 host/device test

---
 clang/include/clang/Basic/BuiltinsAMDGPU.td   | 10 +++++--
 .../include/clang/Basic/BuiltinsAMDGPUDocs.td | 29 +++++++++++++++++++
 clang/test/SemaHIP/amdgpu-av-load-store.hip   | 20 +++++++++++++
 ...s-amdgcn-global-load-store-target-error.cl | 26 ++++++++---------
 4 files changed, 69 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/SemaHIP/amdgpu-av-load-store.hip

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index afcafe4defdbe..5236de8671ebb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -284,8 +284,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am
 // Global Available/Visible memory accesses.
 //===----------------------------------------------------------------------===//
 
-def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">;
-def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">;
+def __builtin_amdgcn_av_load_b128
+    : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> {
+  let Documentation = [DocAVLoadB128];
+}
+def __builtin_amdgcn_av_store_b128
+    : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> {
+  let Documentation = [DocAVStoreB128];
+}
 
 //===----------------------------------------------------------------------===//
 // Async mark builtins.
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index a92b85d75d902..293431c5de7e8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -573,6 +573,35 @@ WMMA with per-operand scale factors applied during the computation.
 }];
 }
 
+//===----------------------------------------------------------------------===//
+// Global Available/Visible Memory Access Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatAVLoadStore : DocumentationCategory<"Available/Visible Memory Access Builtins"> {
+  let Content = [{
+These builtins perform 128-bit global or flat memory loads and stores with
+available/visible (AV) semantics.
+}];
+}
+
+def DocAVLoadB128 : Documentation {
+  let Category = DocCatAVLoadStore;
+  let Content = [{
+Loads 128 bits (4 x i32) from the pointer ``ptr``. The pointer must be in
+the global or generic address space. The ``scope`` argument specifies the
+synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
+def DocAVStoreB128 : Documentation {
+  let Category = DocCatAVLoadStore;
+  let Content = [{
+Stores 128 bits (4 x i32) of ``data`` to the pointer ``ptr``. The pointer
+must be in the global or generic address space. The ``scope`` argument
+specifies the synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
 //===----------------------------------------------------------------------===//
 // Wave Data Exchange Builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/SemaHIP/amdgpu-av-load-store.hip b/clang/test/SemaHIP/amdgpu-av-load-store.hip
new file mode 100644
index 0000000000000..1e9688e891228
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-av-load-store.hip
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+
+__device__ void test_av_load_store_device(v4u32 *ptr, v4u32 data) {
+  v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+  __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}
+
+__global__ void test_av_load_store_kernel(v4u32 *ptr, v4u32 data) {
+  v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+  __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
index e85b120661cfd..cec85fbeb9446 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -1,26 +1,24 @@
-// We test loads and stores separately because clang only seems to exit after
-// the first 'target feature' error.
-
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD  -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD  -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD  -S -verify -o - %s
-
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/store.cl
 // REQUIRES: amdgpu-registered-target
 
+//--- load.cl
 typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
 typedef v4u32 __global *global_ptr_to_v4u32;
 
-#ifdef TEST_LOAD
 v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
   return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
 }
-#endif
 
-#ifdef TEST_STORE
+//--- store.cl
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
 void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
   __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
 }
-#endif

>From c2f14391158fae4765605c0619e4de08dd442321 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 3 Jun 2026 12:21:37 +0530
Subject: [PATCH 3/4] add ArgNames; use target feature "flat-global-insts"

---
 clang/include/clang/Basic/BuiltinsAMDGPU.td               | 8 +++++---
 .../builtins-amdgcn-global-load-store-target-error.cl     | 4 ++--
 llvm/lib/TargetParser/AMDGPUTargetParser.cpp              | 8 ++++++++
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 5236de8671ebb..36b62a5ee7c72 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -285,12 +285,14 @@ def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__am
 //===----------------------------------------------------------------------===//
 
 def __builtin_amdgcn_av_load_b128
-    : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts"> {
+    : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "flat-global-insts"> {
   let Documentation = [DocAVLoadB128];
+  let ArgNames = ["ptr", "scope"];
 }
 def __builtin_amdgcn_av_store_b128
-    : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts"> {
+    : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "flat-global-insts"> {
   let Documentation = [DocAVStoreB128];
+  let ArgNames = ["ptr", "data", "scope"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -375,7 +377,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i
 // GFX9+ only builtins.
 //===----------------------------------------------------------------------===//
 
-def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">;
+def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "flat-global-insts">;
 
 def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
 def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">;
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
index cec85fbeb9446..9a61513cdc05b 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -12,7 +12,7 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int
 typedef v4u32 __global *global_ptr_to_v4u32;
 
 v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
-  return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}}
+  return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature flat-global-insts}}
 }
 
 //--- store.cl
@@ -20,5 +20,5 @@ typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int
 typedef v4u32 __global *global_ptr_to_v4u32;
 
 void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
-  __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}}
+  __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature flat-global-insts}}
 }
diff --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
index 756b7c2154ca2..24e6ece329c4c 100644
--- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
+++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
@@ -226,6 +226,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -280,6 +281,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -313,6 +315,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -354,6 +357,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -391,6 +395,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["image-insts"] = true;
@@ -427,6 +432,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["image-insts"] = true;
     Features["s-memrealtime"] = true;
@@ -486,6 +492,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["dot7-insts"] = true;
     Features["dot10-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx8-insts"] = true;
     Features["16-bit-insts"] = true;
     Features["dpp"] = true;
@@ -532,6 +539,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
   case GK_GFX900:
   case GK_GFX9_GENERIC:
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["vmem-to-lds-load-insts"] = true;
     [[fallthrough]];
   case GK_GFX810:

>From 4c2061ad884a1727f097e94adea7a569e0a1957b Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 9 Jun 2026 17:30:22 +0530
Subject: [PATCH 4/4] fix failing tests

---
 clang/include/clang/Basic/BuiltinsAMDGPU.td          | 2 +-
 clang/test/CodeGen/amdgpu-builtin-is-invocable.c     | 2 +-
 clang/test/CodeGen/amdgpu-builtin-processor-is.c     | 2 +-
 clang/test/CodeGen/link-builtin-bitcode.c            | 6 +++---
 clang/test/CodeGenCXX/dynamic-cast-address-space.cpp | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 36b62a5ee7c72..8eed188b0f4b2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -377,7 +377,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i
 // GFX9+ only builtins.
 //===----------------------------------------------------------------------===//
 
-def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "flat-global-insts">;
+def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">;
 
 def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
 def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">;
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 3f988cbdf7cee..8de4b0ec8bc20 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -54,7 +54,7 @@ void foo() {
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
 // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
 // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 308eec1d212a5..8dfbb55566598 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -63,7 +63,7 @@ void foo() {
 //.
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
 // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
diff --git a/clang/test/CodeGen/link-builtin-bitcode.c b/clang/test/CodeGen/link-builtin-bitcode.c
index ed5abca7f34b0..8849ee735ce04 100644
--- a/clang/test/CodeGen/link-builtin-bitcode.c
+++ b/clang/test/CodeGen/link-builtin-bitcode.c
@@ -44,6 +44,6 @@ int bar() { return no_attr() + attr_in_target() + attr_not_in_target() + attr_in
 // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] {
 
 // CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" }
-// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" }
-// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" }
-// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" }
+// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" }
+// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" }
+// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" }
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index dbca787e26da2..8252a511d6b44 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -107,9 +107,9 @@ const B& f(A *a) {
 // CHECK: attributes #[[ATTR3]] = { nounwind }
 // CHECK: attributes #[[ATTR4]] = { noreturn }
 //.
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
 //.