[llvm] [AMDGPU][True16][CodeGen] D16 LDS lodd/store pseudo instructions in true16 (PR #131427)

via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 16 19:17:30 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

<details>
<summary>Changes</summary>

Implement new pseudos with the suffix _t16 which have VGPR_16 as the store src or load dst. This affects LDS 8 and 16-bit loads and stores. Lower the pseudos to the existing real Hi/Lo instructions in MC inst layer with VGPR_32 src or dst

---

Patch is 166.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131427.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+70-17) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_load_local.ll (+397-112) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_store_local.ll (+400-86) 
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+61-31) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (+72-32) 
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+1182-566) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d3487daee364f..e1e7433b04697 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
   }
 }
 
+multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32> 
+: DS_1A1D_NORET_mc<opName, rc> {
+  let has_m0_read = 0 in {
+    let True16Predicate = UseRealTrue16Insts in {
+      def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+    }
+  }
+}
+
 multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
   let has_m0_read = 0 in {
     def "" : DS_1A1D_NORET<opName, rc>;
@@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
   }
 }
 
+multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> 
+: DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
+  let has_m0_read = 0 in {
+    let True16Predicate = UseRealTrue16Insts in {
+      def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
+    }
+  }
+}
+
 multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
   let has_m0_read = 0 in {
     def "" : DS_1A_RET<opName, rc>;
@@ -457,8 +475,6 @@ defm DS_MIN_F32       : DS_1A1D_NORET_mc<"ds_min_f32">;
 defm DS_MAX_F32       : DS_1A1D_NORET_mc<"ds_max_f32">;
 
 let mayLoad = 0 in {
-defm DS_WRITE_B8      : DS_1A1D_NORET_mc<"ds_write_b8">;
-defm DS_WRITE_B16     : DS_1A1D_NORET_mc<"ds_write_b16">;
 defm DS_WRITE_B32     : DS_1A1D_NORET_mc<"ds_write_b32">;
 defm DS_WRITE2_B32    : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">;
 defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">;
@@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
 
 } // End has_m0_read = 0
 
+defm DS_WRITE_B8      : DS_1A1D_NORET_t16<"ds_write_b8">;
+defm DS_WRITE_B16     : DS_1A1D_NORET_t16<"ds_write_b16">;
+
 let SubtargetPredicate = HasDSAddTid in {
 def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
 }
@@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
 }
 
 let mayStore = 0 in {
-defm DS_READ_I8      : DS_1A_RET_mc<"ds_read_i8">;
-defm DS_READ_U8      : DS_1A_RET_mc<"ds_read_u8">;
 defm DS_READ_I16     : DS_1A_RET_mc<"ds_read_i16">;
-defm DS_READ_U16     : DS_1A_RET_mc<"ds_read_u16">;
 defm DS_READ_B32     : DS_1A_RET_mc<"ds_read_b32">;
 defm DS_READ_B64     : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
 
@@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
 }
 } // End has_m0_read = 0
 
+defm DS_READ_I8      : DS_1A_RET_t16<"ds_read_i8">;
+defm DS_READ_U8      : DS_1A_RET_t16<"ds_read_u8">;
+defm DS_READ_U16     : DS_1A_RET_t16<"ds_read_u16">;
+
 let SubtargetPredicate = HasDSAddTid in {
 def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
 }
@@ -784,34 +804,51 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
   }
 }
 
+multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
+
+  let OtherPredicates = [LDSRequiresM0Init] in {
+    def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+  }
+
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+      let True16Predicate = p in {
+        def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+      }
+    let True16Predicate = UseRealTrue16Insts in {
+      def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+    }
+  }
+}
+
 class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
   (inst $ptr, Offset:$offset, (i1 0), $in)
 >;
 
 defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_I8,  i16, "sextloadi8_local">;
 defm : DSReadPat_mc <DS_READ_U8,  i32, "extloadi8_local">;
 defm : DSReadPat_mc <DS_READ_U8,  i32, "zextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8,  i16, "extloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8,  i16, "zextloadi8_local">;
 defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
 defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
 defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
 defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
+defm : DSReadPat_t16 <DS_READ_I8,  i16, "sextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8,  i16, "extloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8,  i16, "zextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "load_local">;
 
 foreach vt = Reg32Types.types in {
 defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
 }
 
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_8_local">;
 defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_zext_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_zext_8_local">;
 defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_zext_8_local">;
-defm : DSReadPat_mc <DS_READ_I8, i16, "atomic_load_sext_8_local">;
+defm : DSReadPat_t16 <DS_READ_I8, i16, "atomic_load_sext_8_local">;
 defm : DSReadPat_mc <DS_READ_I8, i32, "atomic_load_sext_8_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "atomic_load_16_local">;
 defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
 defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_zext_16_local">;
 defm : DSReadPat_mc <DS_READ_I16, i32, "atomic_load_sext_16_local">;
@@ -850,18 +887,34 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
   }
 }
 
+multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
+  let OtherPredicates = [LDSRequiresM0Init] in {
+    def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+  }
+
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+      let True16Predicate = p in {
+        def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+      }
+    let True16Predicate = UseRealTrue16Insts in {
+      def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+    }
+  }
+}
+
 defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
 defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "truncstorei8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "store_local">;
 
 foreach vt = Reg32Types.types in {
 defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
 }
 
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "atomic_store_8_local">;
 defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "atomic_store_16_local">;
 defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
 defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
 defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index a3b6c283512f3..7f45b038b6d0d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -1,208 +1,493 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_u8 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_u8 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_load_u8_d16 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_load_u8 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
   ret i8 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_u8 v0, v0 offset:16
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_u8 v0, v0 offset:16
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_load_u8_d16 v0, v0 offset:16
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_load_u8 v0, v0 offset:16
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
   %load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
   ret i8 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_u16 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
   ret i16 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_u16 v0, v0 offset:32
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_u16 v0, v0 offset:32
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_load_u16_d16 v0, v0 offset:32
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_load_u16 v0, v0 offset:32
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
   %load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
   ret i16 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
   ret i32 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b32 v0, v0 offset:64
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
   %load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
   ret i32 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b64 v[0:1], v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b64 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
   ret i64 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b64 v[0:1], v0 offset:128
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0 offset:128
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b64 v[0:1], v0 offset:128
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
   %load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
   ret i64 %load
 }
 
-; GCN-LABEL: {{^}}atomic_load_monotonic_f32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b32 v0, v0 offset:64
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f32_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_f32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
   %load = ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131427


More information about the llvm-commits mailing list