[llvm] [AMDGPU][True16][CodeGen] D16 LDS lodd/store pseudo instructions in true16 (PR #131427)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 16 19:17:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Implement new pseudos with the suffix _t16 which have VGPR_16 as the store src or load dst. This affects LDS 8 and 16-bit loads and stores. Lower the pseudos to the existing real Hi/Lo instructions in MC inst layer with VGPR_32 src or dst
---
Patch is 166.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131427.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+70-17)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_load_local.ll (+397-112)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_store_local.ll (+400-86)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+61-31)
- (modified) llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (+72-32)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+1182-566)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d3487daee364f..e1e7433b04697 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
}
}
+multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32>
+: DS_1A1D_NORET_mc<opName, rc> {
+ let has_m0_read = 0 in {
+ let True16Predicate = UseRealTrue16Insts in {
+ def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+ }
+ }
+}
+
multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
let has_m0_read = 0 in {
def "" : DS_1A1D_NORET<opName, rc>;
@@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}
+multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset>
+: DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
+ let has_m0_read = 0 in {
+ let True16Predicate = UseRealTrue16Insts in {
+ def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
+ }
+ }
+}
+
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
let has_m0_read = 0 in {
def "" : DS_1A_RET<opName, rc>;
@@ -457,8 +475,6 @@ defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
let mayLoad = 0 in {
-defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">;
-defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">;
defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">;
defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">;
defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">;
@@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
} // End has_m0_read = 0
+defm DS_WRITE_B8 : DS_1A1D_NORET_t16<"ds_write_b8">;
+defm DS_WRITE_B16 : DS_1A1D_NORET_t16<"ds_write_b16">;
+
let SubtargetPredicate = HasDSAddTid in {
def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
}
@@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
}
let mayStore = 0 in {
-defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">;
-defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">;
defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">;
-defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">;
defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">;
defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
@@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
} // End has_m0_read = 0
+defm DS_READ_I8 : DS_1A_RET_t16<"ds_read_i8">;
+defm DS_READ_U8 : DS_1A_RET_t16<"ds_read_u8">;
+defm DS_READ_U16 : DS_1A_RET_t16<"ds_read_u16">;
+
let SubtargetPredicate = HasDSAddTid in {
def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
}
@@ -784,34 +804,51 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
+multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
+
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ let True16Predicate = p in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+ }
+ }
+}
+
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
(inst $ptr, Offset:$offset, (i1 0), $in)
>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
+defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "extloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "zextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "load_local">;
foreach vt = Reg32Types.types in {
defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
}
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_zext_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_zext_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_zext_8_local">;
-defm : DSReadPat_mc <DS_READ_I8, i16, "atomic_load_sext_8_local">;
+defm : DSReadPat_t16 <DS_READ_I8, i16, "atomic_load_sext_8_local">;
defm : DSReadPat_mc <DS_READ_I8, i32, "atomic_load_sext_8_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_zext_16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "atomic_load_sext_16_local">;
@@ -850,18 +887,34 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
}
+multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ let True16Predicate = p in {
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+ }
+ }
+}
+
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "truncstorei8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "store_local">;
foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "atomic_store_8_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "atomic_store_16_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index a3b6c283512f3..7f45b038b6d0d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -1,208 +1,493 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u8 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u8 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
ret i8 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u8 v0, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u8 v0, v0 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 offset:16
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
%load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
ret i8 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
ret i16 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
%load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
ret i16 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
ret i32 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
%load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
ret i32 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
ret i64 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
%load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
ret i64 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_f32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_f32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
%load = ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/131427
More information about the llvm-commits
mailing list