[llvm-branch-commits] [clang] [llvm] AMDGPU: Support buffer_atomic_pk_add_bf16 for gfx950 (PR #117599)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 25 13:57:20 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117599
>From 35faf696c6d46717bc65a94e130f04777a5c12c8 Mon Sep 17 00:00:00 2001
From: Sirish Pande <Sirish.Pande at amd.com>
Date: Thu, 16 May 2024 16:21:42 -0500
Subject: [PATCH] AMDGPU: Support buffer_atomic_pk_add_bf16 for gfx950
Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
---
clang/test/CodeGenOpenCL/amdgpu-features.cl | 6 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 3 +-
llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +
llvm/lib/TargetParser/TargetParser.cpp | 2 +
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll | 92 +++++++++++++++++++
llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 8 +-
llvm/test/MC/AMDGPU/gfx950_asm_features.s | 60 ++++++++++++
llvm/test/MC/AMDGPU/gfx950_err.s | 12 +++
llvm/test/MC/Disassembler/AMDGPU/gfx950.txt | 45 +++++++++
10 files changed, 224 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 1e2921160d28f2..f739872685e780 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
@@ -109,8 +109,8 @@
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" "uniform-work-group-size"="true"
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c35c1635a7b6b5..79ad9dbb67430d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1452,7 +1452,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+.
+// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+.
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
@@ -1527,7 +1527,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<2, 0>;
-// gfx908 intrinsic
+// gfx908 intrinsic. Supports v2bf16 on gfx12+ and gfx950
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index be213431ea8f1f..89a35d911e8b1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1593,7 +1593,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureDot12Insts,
- FeatureDot13Insts
+ FeatureDot13Insts,
+ FeatureAtomicBufferPkAddBF16Inst
])>;
def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index e5978aee2b39a3..a288c58def5cbd 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -3296,6 +3296,8 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_vi <0x3f>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_vi <0x52>;
+
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
let SubtargetPredicate = isGFX90APlus in {
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 8b445b1aaa0548..a2782d00b35203 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -379,6 +379,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["atomic-ds-pk-add-16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+ Features["atomic-buffer-pk-add-bf16-inst"] = true;
Features["atomic-global-pk-add-bf16-inst"] = true;
Features["16-bit-insts"] = true;
Features["dpp"] = true;
@@ -479,6 +480,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["ashr-pk-insts"] = true;
Features["dot12-insts"] = true;
Features["dot13-insts"] = true;
+ Features["atomic-buffer-pk-add-bf16-inst"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
new file mode 100644
index 00000000000000..ab380dbef107ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX950-SDAG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX950-GISEL
+
+declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
+
+define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_ret:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen sc0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: flat_store_dword v[2:3], v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen sc0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: flat_store_dword v[2:3], v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ store <2 x bfloat> %orig, ptr null
+ ret float 1.0
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_noret:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen
+; GFX950-GISEL-NEXT: s_endpgm
+ %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX950-SDAG-LABEL: raw_buffer_atomic_add_v2bf16:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: raw_buffer_atomic_add_v2bf16:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX950-GISEL-NEXT: s_endpgm
+ %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX950-SDAG-LABEL: raw_buffer_atomic_add_v2bf16_ret:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen sc0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: flat_store_dword v[2:3], v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen sc0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: flat_store_dword v[2:3], v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %orig = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ store <2 x bfloat> %orig, ptr null
+ ret float 1.0
+}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
index efeaf8339f6928..40ea9bb3678d9e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
@@ -2636,16 +2636,16 @@ buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCA
// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 glc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
index 614d9e8703ae45..5aa1e2d5ccb11d 100644
--- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -1089,3 +1089,63 @@ v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02]
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0xff,0x02,0x03]
+buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x03,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x18,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x65]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x7c]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x80]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xc1]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf0]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:4095
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf7]
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:4095
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index 21ffdba0d8e0cb..fd3da56c5130c4 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -341,3 +341,15 @@ v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 div:2
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 clamp div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 glc
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 slc
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 dlc
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 glc slc dlc
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
index 3852845d308834..9fc9c58387b901 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
@@ -74,3 +74,48 @@
# GFX950: v_permlane32_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0x9a,0xd1,0x02,0x01,0x00,0x00]
0x01,0x08,0x9a,0xd1,0x02,0x01,0x00,0x00
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0xff,0x02,0x03]
+0xff,0x0f,0x48,0xe1,0x00,0xff,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x03,0x03]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x03,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x18,0x03]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x18,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x65]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x65
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x7c]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x7c
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe1,0x00,0x05,0x02,0x03]
+0xff,0x2f,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe1,0x00,0x05,0x02,0x03]
+0xff,0x1f,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
+0x07,0x00,0x48,0xe1,0x00,0x05,0x02,0x03
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x80]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x80
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xc1]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xc1
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf0]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf0
+
+# GFX950: buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf7]
+0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf7
More information about the llvm-branch-commits
mailing list