[llvm-branch-commits] [llvm] [AMDGPU] Return two MMOs for load-to-lds and store-from-lds intrinsics (PR #175845)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 13 13:47:25 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Nicolai Hähnle (nhaehnle)
<details>
<summary>Changes</summary>
Accurately represent both the load and the store part of those
intrinsics.
The test changes are mostly insignificant and appear to be caused by
subtly different scheduler behavior.
---
**Stack**:
- [4/4] #<!-- -->175846
- [3/4] #<!-- -->175845 ⬅
- [2/4] #<!-- -->175844
- [1/4] #<!-- -->175843
⚠️ *Part of a stack created by [spr](https://github.com/nhaehnle/spr). Merging this PR using the GitHub UI may have unexpected results.*
---
Full diff: https://github.com/llvm/llvm-project/pull/175845.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+56-51)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll (+3-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cc239752cd453..5224f2f093755 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1440,8 +1440,23 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+
+ // Entry 0: Load from buffer.
+ // Don't set an offset, since the pointer value always represents the
+ // base of the buffer.
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
+ Info.flags &= ~MachineMemOperand::MOStore;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // Instruction offset is applied, and an additional per-lane offset
+ // which we simulate using a larger memory type.
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))->getZExtValue();
+ Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
Infos.push_back(Info);
return;
}
@@ -1634,10 +1649,18 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+ // Entry 0: Load from source (global/flat).
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
Infos.push_back(Info);
return;
}
@@ -1645,24 +1668,45 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ // Entry 0: Load from LDS.
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(0);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to global (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
- Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
- Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
- if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
+
+ // Entry 0: Load from source (global/flat).
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.ptrVal = CI.getArgOperand(0); // Source pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ if (IsVolatile)
Info.flags |= MachineMemOperand::MOVolatile;
Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // Same offset from the instruction, but an additional per-lane offset is
+ // added. Represent that using a wider memory type.
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11184,7 +11228,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = Op.getConstantOperandVal(1);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
@@ -11459,29 +11502,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- // Don't set the offset value here because the pointer points to the base of
- // the buffer.
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
- LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
@@ -11563,25 +11585,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = Op->getConstantOperandVal(5);
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
- LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 352af044b0a6d..f66ad928d261d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -267,9 +267,8 @@ main_body:
define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
@@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture
;
; GFX942-LABEL: buffer_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
index 0b40c81af414d..14d2e4ca5f2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -340,18 +340,16 @@ define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gpt
define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX90A: ; %bb.0:
-; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX942: ; %bb.0:
-; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
index a00aca34252b1..74fddfe290818 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -11,15 +11,14 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT: s_add_u32 s4, s0, 64
; CHECK-NEXT: s_addc_u32 s5, s1, 0
; CHECK-NEXT: s_mov_b32 m0, s2
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
-; CHECK-NEXT: s_load_dword s4, s[0:1], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s6
; CHECK-NEXT: global_store_dword v0, v3, s[0:1] offset:64
+; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: v_mov_b32_e32 v2, s3
``````````
</details>
https://github.com/llvm/llvm-project/pull/175845
More information about the llvm-branch-commits
mailing list