[llvm] [SelectionDAG] Optimize MPI for align(1) GEPs using base pointer (PR #145309)
Acthinks Yang via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 04:01:59 PDT 2025
https://github.com/Acthinks created https://github.com/llvm/llvm-project/pull/145309
Summary:
For align(1) loads/stores through GEPs with a constant offset:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve the base pointer's stronger alignment
- Implemented directly in SelectionDAGBuilder::visitLoad/visitStore
Issue: #143215
Future:
1. Optimize expandUnalignedLoad/Store
2. Improve computeKnownBits for phi(GEP)
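To illustrate (a minimal sketch; the "after" form mirrors the comment this patch adds in visitStore, and the value names are illustrative):

    ; %p is known to be align 4
    %gep = getelementptr inbounds i8, ptr %p, i32 1
    store i32 %v, ptr %gep, align 1

Before this change the MachineMemOperand is keyed on the GEP, roughly store<(store (s32) into %gep, align 1)>. With it, the MPI becomes (%p, 1) and the node prints as store<(store (s32) into %p + 1, align 1, basealign 4)>, so targets can still see the base pointer's align 4.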
From c6453fbdd26f18f587fe9c76fa781405450e7816 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Mon, 23 Jun 2025 17:56:44 +0800
Subject: [PATCH] [SelectionDAG] Optimize MPI for align(1) GEPs using base
pointer
Summary:
For align(1) loads/stores through GEPs with a constant offset:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve the base pointer's stronger alignment
- Implemented directly in SelectionDAGBuilder::visitLoad/visitStore
Issue: #143215
Future:
1. Optimize expandUnalignedLoad/Store
2. Improve computeKnownBits for phi(GEP)
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 59 ++++++++++--
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 52 +++++------
.../CodeGen/RISCV/unaligned-load-store.ll | 92 +++++++++++++++++++
3 files changed, 169 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c01f1e7928477..3f5963a735ffb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4566,6 +4566,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4587,7 +4588,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4597,7 +4598,25 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+ TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+  // See the comment in visitStore for an example of this rewrite.
+ int64_t Offset = 0;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SV);
+ GEP && Alignment == Align(1)) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, &I, AC);
+ if (BaseAlignment > Alignment) {
+ SV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
SDValue Root;
bool ConstantMemory = false;
@@ -4647,7 +4666,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4734,6 +4753,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
+ const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4754,8 +4774,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+ &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4772,7 +4792,32 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+  // Refine MPI: V + Offset
+  // Example:
+  //   %p is align 4
+  //   %gep = getelementptr i8, ptr %p, i32 1
+  //   store i32 %v, ptr %gep, align 1
+  // ->
+  //   MPI: V = %p, Offset = 1
+  //   SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+ int64_t Offset = 0;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+ GEP && Alignment == Align(1)) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, &I, AC);
+ if (BaseAlignment > Alignment) {
+ PtrV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4787,7 +4832,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..2010c350226dc 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,13 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT: s_load_dword s3, s[4:5], 0xd
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4725,10 +4725,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT: s_lshl_b32 s0, s3, 24
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, s0, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -4740,45 +4740,43 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: s_addc_u32 s3, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 3
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_load_ubyte v6, v[0:1]
+; VI-NEXT: s_load_dword s2, s[4:5], 0x34
+; VI-NEXT: s_add_u32 s0, s4, 50
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s2, s2, 24
; VI-NEXT: s_add_u32 s0, s4, 51
+; VI-NEXT: flat_load_ubyte v7, v[0:1]
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[4:5], 0x24
+; VI-NEXT: s_load_dword s3, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dword v[2:3], v7
+; VI-NEXT: flat_store_dword v[2:3], v9
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v9
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v7
+; VI-NEXT: v_or_b32_e32 v4, v4, v6
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; VI-NEXT: v_or_b32_e32 v5, s2, v5
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index c9c49e8f7f532..cde1b6af4717f 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -578,5 +578,97 @@ define void @store_large_constant(ptr %x) {
store i64 18364758544493064720, ptr %x, align 1
ret void
}
+
+define void @store_const_with_align_attribute(ptr align 2 %p) {
+; SLOW-LABEL: store_const_with_align_attribute:
+; SLOW: # %bb.0: # %entry
+; SLOW-NEXT: sb zero, 3(a0)
+; SLOW-NEXT: sh zero, 4(a0)
+; SLOW-NEXT: sb zero, 6(a0)
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: store_const_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: sw zero, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 0, ptr %len, align 1
+ ret void
+}
+
+; TODO: opt expandUnalignedStore
+define void @store_with_align_attribute(ptr align 2 %p, i32 %v) {
+; SLOW-LABEL: store_with_align_attribute:
+; SLOW: # %bb.0: # %entry
+; SLOW-NEXT: srli a2, a1, 24
+; SLOW-NEXT: srli a3, a1, 16
+; SLOW-NEXT: srli a4, a1, 8
+; SLOW-NEXT: sb a1, 3(a0)
+; SLOW-NEXT: sb a4, 4(a0)
+; SLOW-NEXT: sb a3, 5(a0)
+; SLOW-NEXT: sb a2, 6(a0)
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: store_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: sw a1, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+; TODO: opt expandUnalignedLoad
+define i32 @load_with_align_attribute(ptr align 2 %p) {
+; SLOWBASE-LABEL: load_with_align_attribute:
+; SLOWBASE: # %bb.0: # %entry
+; SLOWBASE-NEXT: lbu a1, 4(a0)
+; SLOWBASE-NEXT: lbu a2, 3(a0)
+; SLOWBASE-NEXT: lbu a3, 5(a0)
+; SLOWBASE-NEXT: lbu a0, 6(a0)
+; SLOWBASE-NEXT: slli a1, a1, 8
+; SLOWBASE-NEXT: or a1, a1, a2
+; SLOWBASE-NEXT: slli a3, a3, 16
+; SLOWBASE-NEXT: slli a0, a0, 24
+; SLOWBASE-NEXT: or a0, a0, a3
+; SLOWBASE-NEXT: or a0, a0, a1
+; SLOWBASE-NEXT: ret
+;
+; RV32IZBKB-LABEL: load_with_align_attribute:
+; RV32IZBKB: # %bb.0: # %entry
+; RV32IZBKB-NEXT: lbu a1, 4(a0)
+; RV32IZBKB-NEXT: lbu a2, 5(a0)
+; RV32IZBKB-NEXT: lbu a3, 6(a0)
+; RV32IZBKB-NEXT: lbu a0, 3(a0)
+; RV32IZBKB-NEXT: packh a2, a2, a3
+; RV32IZBKB-NEXT: packh a0, a0, a1
+; RV32IZBKB-NEXT: pack a0, a0, a2
+; RV32IZBKB-NEXT: ret
+;
+; RV64IZBKB-LABEL: load_with_align_attribute:
+; RV64IZBKB: # %bb.0: # %entry
+; RV64IZBKB-NEXT: lbu a1, 3(a0)
+; RV64IZBKB-NEXT: lbu a2, 4(a0)
+; RV64IZBKB-NEXT: lbu a3, 5(a0)
+; RV64IZBKB-NEXT: lbu a0, 6(a0)
+; RV64IZBKB-NEXT: packh a1, a1, a2
+; RV64IZBKB-NEXT: slli a3, a3, 16
+; RV64IZBKB-NEXT: slli a0, a0, 24
+; RV64IZBKB-NEXT: or a0, a0, a3
+; RV64IZBKB-NEXT: or a0, a0, a1
+; RV64IZBKB-NEXT: ret
+;
+; FAST-LABEL: load_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: lw a0, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SLOWZBKB: {{.*}}