[llvm] [AMDGPU] Use fake16 load/store with +real-true16 and srcam-ecc (PR #161256)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 29 11:33:19 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
Fixes: SC1-6072
---
Patch is 260.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161256.diff
12 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+4)
- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+2-2)
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_load_local.ll (+132)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_store_local.ll (+212)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+2432-1156)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/minmax.ll (+21-24)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..53b8c314d2ac4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2595,6 +2595,10 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def Use32BitLoadStoreWithTrue16Insts : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureSRAMECC)>;
+
def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index f2e432fa8d7f5..830443380b443 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -969,7 +969,7 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
@@ -1050,7 +1050,7 @@ multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 9f33bac4c56ea..80999a5a5cc84 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1982,7 +1982,7 @@ defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
@@ -2127,7 +2127,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
@@ -2187,7 +2187,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
@@ -2356,7 +2356,7 @@ defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
@@ -2366,7 +2366,7 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
}
-let True16Predicate = UseRealTrue16Insts in {
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index aaedb85c8353c..e67d7fdba58d3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
; CI-LABEL: atomic_load_monotonic_i8:
@@ -33,6 +35,14 @@ define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u8 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
ret i8 %load
}
@@ -66,6 +76,14 @@ define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u8 v0, v0 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
%load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
ret i8 %load
@@ -100,6 +118,14 @@ define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
ret i16 %load
}
@@ -133,6 +159,14 @@ define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
%load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
ret i16 %load
@@ -160,6 +194,14 @@ define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
ret i32 %load
}
@@ -186,6 +228,14 @@ define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i32_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
%load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
ret i32 %load
@@ -213,6 +263,14 @@ define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
ret i64 %load
}
@@ -239,6 +297,14 @@ define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i64_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
%load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
ret i64 %load
@@ -266,6 +332,14 @@ define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f32_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
%load = load atomic float, ptr addrspace(3) %gep monotonic, align 4
ret float %load
@@ -293,6 +367,14 @@ define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f64_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16
%load = load atomic double, ptr addrspace(3) %gep monotonic, align 8
ret double %load
@@ -320,6 +402,14 @@ define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_p0i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8
ret ptr %load
@@ -347,6 +437,14 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_p3i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
ret ptr addrspace(3) %load
@@ -381,6 +479,14 @@ define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast half %load to i16
ret i16 %ret
@@ -415,6 +521,14 @@ define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
%load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast half %load to i16
@@ -450,6 +564,14 @@ define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast bfloat %load to i16
ret i16 %ret
@@ -484,6 +606,14 @@ define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_bf16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
%load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast bfloat %load to i16
@@ -491,3 +621,5 @@ define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX1250-FAKE16: {{.*}}
+; GFX1250-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index c2bb4f004e001..83e30773615aa 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
; CI-LABEL: atomic_store_monotonic_i8:
@@ -41,6 +43,28 @@ define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: atomic_store_monotonic_i8:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, 2
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v2
+; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: atomic_store_monotonic_i8:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2
+; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%val1 = add i8 %val, 2
store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1
store atomic i8 %val1, ptr addrspace(3) %ptr monotonic, align 1
@@ -84,6 +108,28 @@ define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, 2
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v2 offset:16
+; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
+; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-FAKE16-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/161256
More information about the llvm-commits
mailing list