[llvm] promoteMUBUFLoadStoreScalarOffset (PR #142328)
Aniket Lal via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 1 21:34:11 PDT 2025
https://github.com/lalaniket8 created https://github.com/llvm/llvm-project/pull/142328
This PR implements the following optimization in SILoadStoreOptimizer.
The instruction sequence:
`v_add_u32 v1, v0, sN`
`buffer_{load,store}_T v*, v1, s[bufDesc:bufDesc+3], 0 offen`
can be optimized to:
`buffer_{load,store}_T v*, v0, s[bufDesc:bufDesc+3], sN offen`
The scalar addend of the vector address is folded into the instruction's soffset operand, which leaves the `v_add_u32` dead. Since the hardware computes the effective address as vaddr + soffset + inst_offset, the fold is only valid when the original soffset is zero.
From 2b8b07248d2f7976f977875e06999074162c2bf8 Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Mon, 2 Jun 2025 10:01:05 +0530
Subject: [PATCH] promoteMUBUFLoadStoreScalarOffset
---
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 73 ++++++++++++++-----
...ion-load-store-into-vector-scalar-parts.ll | 65 +++++++++++++++++
2 files changed, 120 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b0d6fd95cd271..b4667968b6b71 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -282,6 +282,7 @@ class SILoadStoreOptimizer {
bool promoteConstantOffsetToImm(MachineInstr &CI,
MemInfoMap &Visited,
SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+ bool promoteMUBUFLoadStoreScalarOffset(MachineInstr &CI) const;
void addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const;
@@ -427,16 +428,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: // promoteMUBUFLoadStoreScalarOffset candidate
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: // promoteMUBUFLoadStoreScalarOffset candidate
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
- case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
- case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: // promoteMUBUFLoadStoreScalarOffset candidate
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: // promoteMUBUFLoadStoreScalarOffset candidate
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
return BUFFER_LOAD;
@@ -2092,25 +2093,25 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
if (!Base.isReg())
return;
- MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+ MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
|| Def->getNumOperands() != 5)
return;
- MachineOperand BaseLo = Def->getOperand(1);
- MachineOperand BaseHi = Def->getOperand(3);
+ MachineOperand BaseLo = Def->getOperand(1); // %LO:vgpr_32
+ MachineOperand BaseHi = Def->getOperand(3); // %HI:vgpr_32
if (!BaseLo.isReg() || !BaseHi.isReg())
return;
- MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
- MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+ MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); // %LO:vgpr_32, %c:sreg_64_xexec = V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32
+ MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); // %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
!BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
return;
- const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
- const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+ const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); // %BASE_LO:vgpr_32
+ const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); // %103:sgpr_32
auto Offset0P = extractConstOffset(*Src0);
if (Offset0P)
@@ -2120,12 +2121,12 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
return;
BaseLo = *Src0;
}
-
+ // BaseLo = %103:sgpr_32
if (!BaseLo.isReg())
return;
- Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
- Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+ Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); // %BASE_HI:vgpr_32
+ Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); // 0
if (Src0->isImm())
std::swap(Src0, Src1);
@@ -2133,14 +2134,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
if (!Src1->isImm() || Src0->isImm())
return;
- uint64_t Offset1 = Src1->getImm();
- BaseHi = *Src0;
+ uint64_t Offset1 = Src1->getImm(); // 0
+ BaseHi = *Src0; // %BASE_HI:vgpr_32
if (!BaseHi.isReg())
return;
- Addr.Base.LoReg = BaseLo.getReg();
- Addr.Base.HiReg = BaseHi.getReg();
+ Addr.Base.LoReg = BaseLo.getReg(); // %103:sgpr_32
+ Addr.Base.HiReg = BaseHi.getReg(); // %BASE_HI:vgpr_32
Addr.Base.LoSubReg = BaseLo.getSubReg();
Addr.Base.HiSubReg = BaseHi.getSubReg();
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
@@ -2298,6 +2299,39 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
return false;
}
+bool SILoadStoreOptimizer::promoteMUBUFLoadStoreScalarOffset(
+ MachineInstr &MI) const {
+ if (!SIInstrInfo::isMUBUF(MI))
+ return false;
+ LLVM_DEBUG(dbgs() << "promoteMUBUFLoadStoreScalarOffset: "; MI.dump());
+ MachineOperand *VAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ if (!VAddr)
+ return false;
+ // Look through the instruction computing the vector address.
+ MachineInstr *Def = MRI->getUniqueVRegDef(VAddr->getReg());
+ if (!Def)
+ return false;
+ LLVM_DEBUG(dbgs() << " vaddr def: "; Def->dump());
+ MachineOperand *Src0 = TII->getNamedOperand(*Def, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(*Def, AMDGPU::OpName::src1);
+ if (!Src0 || !Src1 || !Src0->isReg() || !Src1->isReg())
+ return false;
+ // The address must be a VGPR plus SGPR sum: exactly one source is scalar.
+ bool Src0IsSGPR =
+ TII->getRegisterInfo().isSGPRClass(MRI->getRegClass(Src0->getReg()));
+ bool Src1IsSGPR =
+ TII->getRegisterInfo().isSGPRClass(MRI->getRegClass(Src1->getReg()));
+ if (Src0IsSGPR == Src1IsSGPR)
+ return false;
+ MachineOperand *ScalarOp = Src0IsSGPR ? Src0 : Src1;
+ (void)ScalarOp;
+
+ // TODO: Check that Def is a V_ADD_U32 and that soffset is zero, then fold
+ // ScalarOp into soffset and the remaining VGPR source into vaddr. Only the
+ // matching is implemented so far.
+ return false;
+}
+
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
@@ -2331,6 +2365,9 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
+ if (promoteMUBUFLoadStoreScalarOffset(MI))
+ Modified = true;
+
// Treat volatile accesses, ordered accesses and unmodeled side effects as
// barriers. We can look after this barrier for separate merges.
if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll
new file mode 100644
index 0000000000000..71994b8252f60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+ at 0 = external dso_local addrspace(4) constant [4 x <2 x float>]
+ at 1 = external dso_local addrspace(4) constant i32
+
+; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset
+; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
+; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
+; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2
+; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
+; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4
+; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
+; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 77,
+; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
+; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
+; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
+; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
+; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
+; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
+; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77,
+define void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> %base, i32 %i, i32 inreg %j, ptr addrspace(1) inreg %out) {
+ %off = add i32 %i, %j
+ %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
+ store i32 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32>, ptr addrspace(8) nocapture, i32, i32, i32 immarg) #1
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) nounwind readnone willreturn
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.reloc.constant(metadata) #3
+
+; Function Attrs: nounwind readnone speculatable
+declare i64 @llvm.amdgcn.s.getpc() #3
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind memory(argmem: write) }
+attributes #2 = { nounwind "amdgpu-unroll-threshold"="700" }
+attributes #3 = { nounwind readnone speculatable }
+attributes #4 = { nounwind writeonly }
+
+!llpc.compute.mode = !{!0}
+!llpc.options = !{!1}
+!llpc.options.CS = !{!2}
+!llpc.user.data.nodes = !{!3, !4, !5, !6}
+!amdgpu.pal.metadata.msgpack = !{!7}
+
+!0 = !{i32 2, i32 3, i32 1}
+!1 = !{i32 245227952, i32 996822128, i32 2024708198, i32 497230408}
+!2 = !{i32 1381820427, i32 1742110173, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64}
+!3 = !{!"DescriptorTableVaPtr", i32 0, i32 1, i32 1}
+!4 = !{!"DescriptorBuffer", i32 4, i32 8, i32 0, i32 0}
+!5 = !{!"DescriptorTableVaPtr", i32 1, i32 1, i32 1}
+!6 = !{!"DescriptorBuffer", i32 4, i32 8, i32 1, i32 0}
+!7 = !{!"\82\B0amdpal.pipelines\91\88\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\82\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\B7.internal_pipeline_hash\92\CF;jLp\0E\9D\E1\B0\CF\1D\A3\22Hx\AE\98f\AA.registers\88\CD.\07\02\CD.\08\03\CD.\09\01\CD.\12\CE\00,\00\00\CD.\13\CD\0F\88\CD.@\CE\10\00\00\00\CD.B\00\CD.C\01\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CFg\D6}\DDR\\\E8\0B\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\02\AEamdpal.version\92\02\03"}
+!8 = !{i32 5}
+!9 = !{!"doff_0_0_b"}
+!10 = !{}
+!11 = !{!"doff_1_0_b"}