[llvm] [SelectionDAG] Optimize unaligned loads/stores to realign after offset (PR #145309)
Acthinks Yang via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 6 20:10:30 PDT 2025
https://github.com/Acthinks updated https://github.com/llvm/llvm-project/pull/145309
>From f4e2549b4c68489b995365de77f11d9b0a43d7f8 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Tue, 1 Jul 2025 09:56:20 +0800
Subject: [PATCH 1/3] [PreCommit] Unaligned load/store realigned after offset
---
.../unaligned-load-store-with-aligned.ll | 157 ++++++++++++++++++
1 file changed, 157 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
new file mode 100644
index 0000000000000..f0a7e18054970
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 1(a0)
+; CHECK-NEXT: sb a4, 2(a0)
+; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sb a2, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 3(a0)
+; CHECK-NEXT: sb a4, 4(a0)
+; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sb a2, 6(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sb zero, 5(a0)
+; CHECK-NEXT: sb zero, 6(a0)
+; CHECK-NEXT: sb zero, 7(a0)
+; CHECK-NEXT: sb zero, 8(a0)
+; CHECK-NEXT: sb zero, 1(a0)
+; CHECK-NEXT: sb zero, 2(a0)
+; CHECK-NEXT: sb zero, 3(a0)
+; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i64 0, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sh zero, 4(a0)
+; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sh zero, 8(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ store i64 0, ptr %len, align 2
+ ret void
+}
+
+define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lbu a2, 1(a0)
+; CHECK-NEXT: lbu a3, 3(a0)
+; CHECK-NEXT: lbu a0, 4(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lbu a2, 3(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a0, 6(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i64 @load_b64_base_align2_offset1(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 3(a0)
+; CHECK-NEXT: lbu a2, 4(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a4, 2(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: slli a2, a2, 16
+; CHECK-NEXT: slli a3, a3, 24
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: or a2, a3, a2
+; CHECK-NEXT: lbu a3, 7(a0)
+; CHECK-NEXT: lbu a4, 6(a0)
+; CHECK-NEXT: lbu a5, 8(a0)
+; CHECK-NEXT: lbu a0, 9(a0)
+; CHECK-NEXT: slli a3, a3, 8
+; CHECK-NEXT: or a3, a3, a4
+; CHECK-NEXT: slli a5, a5, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a5, a0, a5
+; CHECK-NEXT: or a0, a2, a1
+; CHECK-NEXT: or a1, a5, a3
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 1
+ ret i64 %v
+}
+
+define i64 @load_b64_base_align2_offset2(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lhu a1, 4(a0)
+; CHECK-NEXT: lhu a2, 2(a0)
+; CHECK-NEXT: lhu a3, 8(a0)
+; CHECK-NEXT: lhu a4, 6(a0)
+; CHECK-NEXT: slli a0, a1, 16
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: slli a1, a3, 16
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 2
+ ret i64 %v
+}
>From 04d5ecc1ff4d9e299a6f41f68ac98325873a1ed7 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Mon, 23 Jun 2025 17:56:44 +0800
Subject: [PATCH 2/3] [SelectionDAG] Optimize unaligned loads/stores to realign
after offset
Summary:
For loads/stores from GEPs:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve base pointer's stronger alignment
- Optimize expandUnalignedLoad/Store
Issue: #143215
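
To make the splitting strategy concrete, here is a minimal standalone C++ sketch
(illustrative only; the function name and shape are not from the patch, which
performs this inside expandUnalignedLoad/Store on SDValues). Given only the base
pointer's alignment and the constant offset of the access, it computes the widths
of the naturally aligned pieces the expanded load/store can use, lowest address
first (little-endian order):

#include <algorithm>
#include <bit>
#include <cstdint>
#include <vector>

// Split an access of NumBytes at (base + Offset), where only the base pointer
// is known to be BaseAlign-aligned, into naturally aligned power-of-two pieces.
static std::vector<unsigned> splitByAlignment(uint64_t BaseAlign,
                                              uint64_t Offset,
                                              unsigned NumBytes) {
  std::vector<unsigned> Pieces;
  while (NumBytes) {
    // Alignment of the current address: the base alignment, limited by the
    // lowest set bit of the running offset.
    uint64_t AddrAlign =
        Offset ? std::min<uint64_t>(BaseAlign, Offset & -Offset) : BaseAlign;
    unsigned Chunk = static_cast<unsigned>(
        std::min<uint64_t>(AddrAlign, std::bit_floor(NumBytes)));
    Pieces.push_back(Chunk);
    Offset += Chunk;
    NumBytes -= Chunk;
  }
  return Pieces;
}

// Example: BaseAlign = 4, Offset = 1, NumBytes = 8 yields {1, 2, 4, 1}, i.e.
// the sb/sh/sw/sb sequence checked in store_b64_basealign4_offset1 below.
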
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 20 +--
llvm/lib/CodeGen/MachineOperand.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 65 ++++++++-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 125 +++++++++++++++---
.../branch-folding-implicit-def-subreg.ll | 4 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 2 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 54 +++-----
.../unaligned-load-store-with-aligned.ll | 35 ++---
8 files changed, 215 insertions(+), 92 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index a297d3d8f8498..6958a86c37cae 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,27 +50,28 @@ struct MachinePointerInfo {
uint8_t StackID;
+ const Value *OrgV;
+
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ uint8_t ID = 0, const Value *orgv = nullptr)
+ : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
AddrSpace = v ? v->getAddressSpace() : 0;
}
explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
- StackID(0) {}
+ StackID(0), OrgV((const Value *)nullptr) {}
explicit MachinePointerInfo(
- PointerUnion<const Value *, const PseudoSourceValue *> v,
- int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ PointerUnion<const Value *, const PseudoSourceValue *> v,
+ int64_t offset = 0, uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -83,7 +84,8 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
- return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
+ return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
+ OrgV);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d251697f2567..6f10ab39a8ab8 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
- dyn_cast<Instruction>(BasePtr));
+ dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ecd1ff87e7fbc..3a3c19e5dc36b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4567,10 +4567,41 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}
+static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
+ const Value *&PtrV, const Value *&CxtI,
+ int64_t &Offset) {
+ Align PrefAlign = DL.getPrefTypeAlign(Ty);
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+ GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ KnownBits Known = computeKnownBits(PtrV, DL);
+ KnownBits SplitKnown =
+ KnownBits::add(Known, KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), Alignment.value())));
+ unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+ +Value::MaxAlignmentExponent);
+ Align ExpandAlign =
+ Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
+ if (ExpandAlign > Alignment) {
+ CxtI = PtrV;
+ PtrV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
+}
+
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4592,7 +4623,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4602,7 +4633,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+ TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+ // See visitStore comments.
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
SDValue Root;
bool ConstantMemory = false;
@@ -4652,7 +4688,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
+ CxtI)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4739,6 +4776,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
+ const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4759,8 +4797,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+ &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4777,7 +4815,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+ // refine MPI: V + Offset
+ // Example:
+ // align 4 %p
+ // %gep = getelementptr i8, ptr %p, i32 1
+ // store i32 %v, ptr %gep, align 1
+ // ->
+ // MPI: V = %p, Offset = 1
+ // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4792,7 +4842,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
+ 0, CxtI)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 000f8cc6786a5..7f3983db095d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,6 +10370,59 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");
+ Align BaseAlignment = LD->getBaseAlign();
+ Align Alignment = LD->getAlign();
+
+ // Divide the load according to the latest align information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + LD->getPointerInfo().Offset) >
+ Alignment) {
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ SDValue Result = DAG.getConstant(0, dl, VT);
+ SmallVector<SDValue, 4> Chains;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
+ if (RemainderBytes + CurrBytes == NumBytes)
+ ExtType = HiExtType;
+
+ SDValue CurrLD = DAG.getExtLoad(
+ ExtType, dl, VT, Chain,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ LD->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ if (IsLE)
+ Chains.push_back(CurrLD.getValue(1));
+ else
+ Chains.insert(Chains.begin(), CurrLD.getValue(1));
+ SDValue CurrV = DAG.getNode(
+ ISD::SHL, dl, VT, CurrLD,
+ DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
+ Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+ Alignment = commonAlignment(BaseAlignment,
+ LD->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ return std::make_pair(Result, TF);
+ }
// Compute the new VT that is half the size of the old one. This is an
// integer MVT.
unsigned NumBits = LoadedVT.getSizeInBits();
@@ -10377,7 +10430,6 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
NumBits >>= 1;
- Align Alignment = LD->getBaseAlign();
unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();
@@ -10389,24 +10441,24 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
// aggregate the two parts
@@ -10428,7 +10480,8 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Ptr = ST->getBasePtr();
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
- Align Alignment = ST->getBaseAlign();
+ Align BaseAlignment = ST->getBaseAlign();
+ Align Alignment = ST->getAlign();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();
@@ -10447,7 +10500,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
// FIXME: Does not handle truncating floating point stores!
SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
- Alignment, ST->getMemOperand()->getFlags());
+ BaseAlignment, ST->getMemOperand()->getFlags());
return Result;
}
// Do a (aligned) store to a stack slot, then copy from the stack slot
@@ -10515,6 +10568,47 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");
+
+ // Divide the store value according to the latest align information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + ST->getPointerInfo().Offset) >
+ Alignment) {
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
+ SmallVector<SDValue, 8> Stores;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ SDValue CurrST = DAG.getTruncStore(
+ Chain, dl, Val,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ ST->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ if (IsLE)
+ Stores.push_back(CurrST);
+ else
+ Stores.insert(Stores.begin(), CurrST);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+
+ Val = DAG.getNode(ISD::SRL, dl, VT, Val,
+ DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
+ Alignment = commonAlignment(BaseAlignment,
+ ST->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
// Get the half-size VT
EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
unsigned NumBits = NewStoredVT.getFixedSizeInBits();
@@ -10538,17 +10632,18 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Store1, Store2;
Store1 = DAG.getTruncStore(Chain, dl,
DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
- Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags());
+ Ptr, ST->getPointerInfo(), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Store2 = DAG.getTruncStore(
Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
return Result;
}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index ae90cfb631e8d..d04ce840ce5b8 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..557aa8f35001f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..18ec2144f13d4 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:50
+; SI-NEXT: s_load_dword s3, s[4:5], 0xd
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4725,11 +4724,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: s_lshl_b32 s0, s3, 24
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v2, s0, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4741,46 +4738,39 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 3
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_add_u32 s0, s4, 51
-; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x34
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s2, s0, 24
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[4:5], 0x24
+; VI-NEXT: s_load_dword s3, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_or_b32_e32 v6, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dword v[2:3], v7
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v9
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: flat_store_dword v[2:3], v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
index f0a7e18054970..721ef95a21866 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -6,11 +6,9 @@ define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 1(a0)
-; CHECK-NEXT: sb a4, 2(a0)
-; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sh a3, 2(a0)
; CHECK-NEXT: sb a2, 4(a0)
; CHECK-NEXT: ret
entry:
@@ -23,11 +21,9 @@ define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 3(a0)
-; CHECK-NEXT: sb a4, 4(a0)
-; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sh a3, 4(a0)
; CHECK-NEXT: sb a2, 6(a0)
; CHECK-NEXT: ret
entry:
@@ -39,14 +35,10 @@ entry:
define void @store_b64_basealign4_offset1(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sb zero, 5(a0)
-; CHECK-NEXT: sb zero, 6(a0)
-; CHECK-NEXT: sb zero, 7(a0)
-; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: sb zero, 1(a0)
-; CHECK-NEXT: sb zero, 2(a0)
-; CHECK-NEXT: sb zero, 3(a0)
-; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sw zero, 4(a0)
+; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: ret
entry:
%len = getelementptr inbounds nuw i8, ptr %p, i32 1
@@ -58,8 +50,7 @@ define void @store_b64_basealign4_offset2(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh zero, 2(a0)
-; CHECK-NEXT: sh zero, 4(a0)
-; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sw zero, 4(a0)
; CHECK-NEXT: sh zero, 8(a0)
; CHECK-NEXT: ret
entry:
@@ -71,15 +62,12 @@ entry:
define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lhu a1, 2(a0)
; CHECK-NEXT: lbu a2, 1(a0)
-; CHECK-NEXT: lbu a3, 3(a0)
; CHECK-NEXT: lbu a0, 4(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry:
@@ -91,15 +79,12 @@ entry:
define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lhu a1, 4(a0)
; CHECK-NEXT: lbu a2, 3(a0)
-; CHECK-NEXT: lbu a3, 5(a0)
; CHECK-NEXT: lbu a0, 6(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry:
>From b5e656e56adee72a26eceb70fb0d30019e935d4d Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Sun, 6 Jul 2025 18:25:59 +0800
Subject: [PATCH 3/3] use isValidAssumeForContext with AllowEphemerals=true
when determining dereferenceability
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 20 +++++++++----------
llvm/lib/Analysis/Loads.cpp | 4 ++--
llvm/lib/CodeGen/MachineOperand.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 15 +++++---------
4 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 6958a86c37cae..a297d3d8f8498 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,28 +50,27 @@ struct MachinePointerInfo {
uint8_t StackID;
- const Value *OrgV;
-
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
- uint8_t ID = 0, const Value *orgv = nullptr)
- : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
+ uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
+ : V(v), Offset(offset), StackID(ID) {
AddrSpace = v ? v->getAddressSpace() : 0;
}
explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
- StackID(0), OrgV((const Value *)nullptr) {}
+ StackID(0) {}
explicit MachinePointerInfo(
- PointerUnion<const Value *, const PseudoSourceValue *> v,
- int64_t offset = 0, uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
+ PointerUnion<const Value *, const PseudoSourceValue *> v,
+ int64_t offset = 0,
+ uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -84,8 +83,7 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
- return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
- OrgV);
+ return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 880249588f0b2..ba23f8fc69d2d 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -111,7 +111,7 @@ static bool isDereferenceableAndAlignedPointer(
// anyway.
auto *I = dyn_cast<Instruction>(V);
if (I && !isa<AllocaInst>(I))
- return CtxI && isValidAssumeForContext(I, CtxI, DT);
+ return CtxI && isValidAssumeForContext(I, CtxI, DT, true);
return true;
};
if (IsKnownDeref()) {
@@ -183,7 +183,7 @@ static bool isDereferenceableAndAlignedPointer(
if (getKnowledgeForValue(
V, {Attribute::Dereferenceable, Attribute::Alignment}, *AC,
[&](RetainedKnowledge RK, Instruction *Assume, auto) {
- if (!isValidAssumeForContext(Assume, CtxI, DT))
+ if (!isValidAssumeForContext(Assume, CtxI, DT, true))
return false;
if (RK.AttrKind == Attribute::Alignment)
AlignRK = std::max(AlignRK, RK);
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 6f10ab39a8ab8..0d251697f2567 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
- dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
+ dyn_cast<Instruction>(BasePtr));
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3a3c19e5dc36b..e58ce40cfdfd9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4568,8 +4568,7 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
}
static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
- const Value *&PtrV, const Value *&CxtI,
- int64_t &Offset) {
+ const Value *&PtrV, int64_t &Offset) {
Align PrefAlign = DL.getPrefTypeAlign(Ty);
if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
@@ -4588,7 +4587,6 @@ static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
Align BaseAlignment =
getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
if (ExpandAlign > Alignment) {
- CxtI = PtrV;
PtrV = BasePtrV;
Alignment = BaseAlignment;
Offset = OffsetAccumulated.getSExtValue();
@@ -4637,8 +4635,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// See visitStore comments.
int64_t Offset = 0;
- const Value *CxtI = nullptr;
- tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
+ tryToImproveAlign(DL, Ty, Alignment, SV, Offset);
SDValue Root;
bool ConstantMemory = false;
@@ -4688,8 +4685,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
- CxtI)
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4824,8 +4820,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// MPI: V = %p, Offset = 1
// SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
int64_t Offset = 0;
- const Value *CxtI = nullptr;
- tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+ tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, Offset);
auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
@@ -4843,7 +4838,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
- 0, CxtI)
+ 0)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);