[llvm] [AMDGPU] Generalize global.load.lds to buffer fat pointers (PR #134911)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 8 12:19:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: Krzysztof Drewniak (krzysz00)
<details>
<summary>Changes</summary>
Direct load to LDS can also be implemented on buffer fat pointers, using the pointer as the offset to raw.buffer.ptr.load.lds. This commit generalizes the existing intrinsic to support this usage.
---
Full diff: https://github.com/llvm/llvm-project/pull/134911.diff
3 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+5-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+24-1)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+18)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 217e43fcce4fd..fc6dac5dc99fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2624,17 +2624,20 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+// Intrinsic for loading data from a global-memory pointer to LDS
+// Also supports buffer fat pointers.
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
[],
- [LLVMQualPointerType<1>, // Base global pointer to load from
- LLVMQualPointerType<3>, // LDS base pointer to store to
+ [llvm_anyptr_ty, // Global or buffer fat pointer to load from (per-lane)
+ LLVMQualPointerType<3>, // LDS base pointer to store to (uniform)
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
// bit 4 = scc))
+ // See raw_ptr_buffer_load_lds for semantics on ptr addrspace(7)
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 766a4ea250942..f8b3c122d75ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_global_load_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
@@ -2291,7 +2311,10 @@ class AMDGPULowerBufferFatPointers : public ModulePass {
public:
static char ID;
- AMDGPULowerBufferFatPointers() : ModulePass(ID) {}
+ AMDGPULowerBufferFatPointers() : ModulePass(ID) {
+ initializeAMDGPULowerBufferFatPointersPass(
+ *PassRegistry::getPassRegistry());
+ }
bool run(Module &M, const TargetMachine &TM);
bool runOnModule(Module &M) override;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index ee51b0b84554e..75175955b313f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false)
ret void
}
+
+;;; Buffer load to LDS
+
+declare void @llvm.amdgcn.global.load.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
+
+define void @llvm_amdgcn_global_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {;
+; CHECK-LABEL: define void @llvm_amdgcn_global_load_lds(
+; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0
+; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1
+; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0)
+; CHECK-NEXT: ret void
+;
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0)
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/134911
More information about the llvm-commits
mailing list