[llvm] [SelectionDAG] Optimize MPI for align(1) GEPs using base pointer (PR #145309)
Acthinks Yang via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 30 19:31:29 PDT 2025
https://github.com/Acthinks updated https://github.com/llvm/llvm-project/pull/145309
From 8c18de474e304a98dd02bcfa803f77b0bd5860da Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Tue, 1 Jul 2025 09:56:20 +0800
Subject: [PATCH 1/2] [PreCommit] Add tests for unaligned load/store realignment after offset
---
.../unaligned-load-store-with-aligned.ll | 157 ++++++++++++++++++
1 file changed, 157 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
new file mode 100644
index 0000000000000..f0a7e18054970
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 1(a0)
+; CHECK-NEXT: sb a4, 2(a0)
+; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sb a2, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 3(a0)
+; CHECK-NEXT: sb a4, 4(a0)
+; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sb a2, 6(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sb zero, 5(a0)
+; CHECK-NEXT: sb zero, 6(a0)
+; CHECK-NEXT: sb zero, 7(a0)
+; CHECK-NEXT: sb zero, 8(a0)
+; CHECK-NEXT: sb zero, 1(a0)
+; CHECK-NEXT: sb zero, 2(a0)
+; CHECK-NEXT: sb zero, 3(a0)
+; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i64 0, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sh zero, 4(a0)
+; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sh zero, 8(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ store i64 0, ptr %len, align 2
+ ret void
+}
+
+define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lbu a2, 1(a0)
+; CHECK-NEXT: lbu a3, 3(a0)
+; CHECK-NEXT: lbu a0, 4(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lbu a2, 3(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a0, 6(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i64 @load_b64_base_align2_offset1(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 3(a0)
+; CHECK-NEXT: lbu a2, 4(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a4, 2(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: slli a2, a2, 16
+; CHECK-NEXT: slli a3, a3, 24
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: or a2, a3, a2
+; CHECK-NEXT: lbu a3, 7(a0)
+; CHECK-NEXT: lbu a4, 6(a0)
+; CHECK-NEXT: lbu a5, 8(a0)
+; CHECK-NEXT: lbu a0, 9(a0)
+; CHECK-NEXT: slli a3, a3, 8
+; CHECK-NEXT: or a3, a3, a4
+; CHECK-NEXT: slli a5, a5, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a5, a0, a5
+; CHECK-NEXT: or a0, a2, a1
+; CHECK-NEXT: or a1, a5, a3
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 1
+ ret i64 %v
+}
+
+define i64 @load_b64_base_align2_offset2(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lhu a1, 4(a0)
+; CHECK-NEXT: lhu a2, 2(a0)
+; CHECK-NEXT: lhu a3, 8(a0)
+; CHECK-NEXT: lhu a4, 6(a0)
+; CHECK-NEXT: slli a0, a1, 16
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: slli a1, a3, 16
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 2
+ ret i64 %v
+}
From c88643827d1f2bc24d53b52e61479868536f99e2 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Mon, 23 Jun 2025 17:56:44 +0800
Subject: [PATCH 2/2] [SelectionDAG] Optimize unaligned loads/stores to realign
 after offset
Summary:
For loads/stores from GEPs:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve base pointer's stronger alignment
- Optimize expandUnalignedLoad/Store
Issue: #143215
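For illustration, here is a minimal sketch of the intended effect, based on
the visitStore comment added in this patch; the exact lowering depends on the
target (a little-endian target without fast unaligned access is assumed):

  ; %p is known to be align 4; the GEP adds a constant offset of 1
  %gep = getelementptr inbounds nuw i8, ptr %p, i32 1
  store i32 %v, ptr %gep, align 1

  ; before: the MMO only records MPI(%gep, 0) with align 1:
  ;   store<(store (s32) into %gep, align 1)>
  ;   so expandUnalignedStore emits four single-byte stores
  ; after: the MMO records MPI(%p, 1) and keeps the stronger base alignment:
  ;   store<(store (s32) into %p + 1, align 1, basealign 4)>
  ;   so expandUnalignedStore can emit byte/halfword/byte stores at offsets
  ;   1, 2 and 4 instead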
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 20 +--
llvm/lib/CodeGen/MachineOperand.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 65 ++++++++-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 125 +++++++++++++++---
.../branch-folding-implicit-def-subreg.ll | 4 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 2 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 54 +++-----
.../unaligned-load-store-with-aligned.ll | 35 ++---
8 files changed, 215 insertions(+), 92 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index a297d3d8f8498..6958a86c37cae 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,27 +50,28 @@ struct MachinePointerInfo {
uint8_t StackID;
+ const Value *OrgV;
+
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ uint8_t ID = 0, const Value *orgv = nullptr)
+ : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
AddrSpace = v ? v->getAddressSpace() : 0;
}
explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
- StackID(0) {}
+ StackID(0), OrgV((const Value *)nullptr) {}
explicit MachinePointerInfo(
- PointerUnion<const Value *, const PseudoSourceValue *> v,
- int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ PointerUnion<const Value *, const PseudoSourceValue *> v,
+ int64_t offset = 0, uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -83,7 +84,8 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
- return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
+ return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
+ OrgV);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d251697f2567..6f10ab39a8ab8 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
- dyn_cast<Instruction>(BasePtr));
+ dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 04d6fd5f48cc3..34d020f42e37c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4562,10 +4562,41 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}
+static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
+ const Value *&PtrV, const Value *&CxtI,
+ int64_t &Offset) {
+ Align PrefAlign = DL.getPrefTypeAlign(Ty);
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+ GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ KnownBits Known = computeKnownBits(PtrV, DL);
+ KnownBits SplitKnown =
+ KnownBits::add(Known, KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), Alignment.value())));
+ unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+ +Value::MaxAlignmentExponent);
+ Align ExpandAlign =
+ Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
+ if (ExpandAlign > Alignment) {
+ CxtI = PtrV;
+ PtrV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
+}
+
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4587,7 +4618,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4597,7 +4628,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+ TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+ // See visitStore comments.
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
SDValue Root;
bool ConstantMemory = false;
@@ -4647,7 +4683,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
+ CxtI)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4734,6 +4771,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
+ const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4754,8 +4792,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+ &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4772,7 +4810,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+ // refine MPI: V + Offset
+ // Example:
+ // align 4 %p
+ // %gep = getelementptr i8, ptr %p, i32 1
+ // store i32 %v, ptr %gep, align 1
+ // ->
+ // MPI: V = %p, Offset = 1
+ // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4787,7 +4837,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
+ 0, CxtI)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 000f8cc6786a5..7f3983db095d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,6 +10370,59 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");
+ Align BaseAlignment = LD->getBaseAlign();
+ Align Alignment = LD->getAlign();
+
+ // Split the load according to the refined alignment information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + LD->getPointerInfo().Offset) >
+ Alignment) {
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ SDValue Result = DAG.getConstant(0, dl, VT);
+ SmallVector<SDValue, 4> Chains;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
+ if (RemainderBytes + CurrBytes == NumBytes)
+ ExtType = HiExtType;
+
+ SDValue CurrLD = DAG.getExtLoad(
+ ExtType, dl, VT, Chain,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ LD->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ if (IsLE)
+ Chains.push_back(CurrLD.getValue(1));
+ else
+ Chains.insert(Chains.begin(), CurrLD.getValue(1));
+ SDValue CurrV = DAG.getNode(
+ ISD::SHL, dl, VT, CurrLD,
+ DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
+ Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+ Alignment = commonAlignment(BaseAlignment,
+ LD->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ return std::make_pair(Result, TF);
+ }
// Compute the new VT that is half the size of the old one. This is an
// integer MVT.
unsigned NumBits = LoadedVT.getSizeInBits();
@@ -10377,7 +10430,6 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
NumBits >>= 1;
- Align Alignment = LD->getBaseAlign();
unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();
@@ -10389,24 +10441,24 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
// aggregate the two parts
@@ -10428,7 +10480,8 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Ptr = ST->getBasePtr();
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
- Align Alignment = ST->getBaseAlign();
+ Align BaseAlignment = ST->getBaseAlign();
+ Align Alignment = ST->getAlign();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();
@@ -10447,7 +10500,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
// FIXME: Does not handle truncating floating point stores!
SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
- Alignment, ST->getMemOperand()->getFlags());
+ BaseAlignment, ST->getMemOperand()->getFlags());
return Result;
}
// Do a (aligned) store to a stack slot, then copy from the stack slot
@@ -10515,6 +10568,47 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");
+
+ // Split the stored value according to the refined alignment information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + ST->getPointerInfo().Offset) >
+ Alignment) {
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
+ SmallVector<SDValue, 8> Stores;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ SDValue CurrST = DAG.getTruncStore(
+ Chain, dl, Val,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ ST->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ if (IsLE)
+ Stores.push_back(CurrST);
+ else
+ Stores.insert(Stores.begin(), CurrST);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+
+ Val = DAG.getNode(ISD::SRL, dl, VT, Val,
+ DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
+ Alignment = commonAlignment(BaseAlignment,
+ ST->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
// Get the half-size VT
EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
unsigned NumBits = NewStoredVT.getFixedSizeInBits();
@@ -10538,17 +10632,18 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Store1, Store2;
Store1 = DAG.getTruncStore(Chain, dl,
DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
- Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags());
+ Ptr, ST->getPointerInfo(), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Store2 = DAG.getTruncStore(
Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
return Result;
}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 509ba295ea7f7..55e695bc7b9bc 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..557aa8f35001f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..18ec2144f13d4 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:50
+; SI-NEXT: s_load_dword s3, s[4:5], 0xd
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4725,11 +4724,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: s_lshl_b32 s0, s3, 24
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v2, s0, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4741,46 +4738,39 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 3
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_add_u32 s0, s4, 51
-; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x34
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s2, s0, 24
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[4:5], 0x24
+; VI-NEXT: s_load_dword s3, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_or_b32_e32 v6, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dword v[2:3], v7
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v9
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: flat_store_dword v[2:3], v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
index f0a7e18054970..721ef95a21866 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -6,11 +6,9 @@ define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 1(a0)
-; CHECK-NEXT: sb a4, 2(a0)
-; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sh a3, 2(a0)
; CHECK-NEXT: sb a2, 4(a0)
; CHECK-NEXT: ret
entry:
@@ -23,11 +21,9 @@ define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 3(a0)
-; CHECK-NEXT: sb a4, 4(a0)
-; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sh a3, 4(a0)
; CHECK-NEXT: sb a2, 6(a0)
; CHECK-NEXT: ret
entry:
@@ -39,14 +35,10 @@ entry:
define void @store_b64_basealign4_offset1(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sb zero, 5(a0)
-; CHECK-NEXT: sb zero, 6(a0)
-; CHECK-NEXT: sb zero, 7(a0)
-; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: sb zero, 1(a0)
-; CHECK-NEXT: sb zero, 2(a0)
-; CHECK-NEXT: sb zero, 3(a0)
-; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sw zero, 4(a0)
+; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: ret
entry:
%len = getelementptr inbounds nuw i8, ptr %p, i32 1
@@ -58,8 +50,7 @@ define void @store_b64_basealign4_offset2(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh zero, 2(a0)
-; CHECK-NEXT: sh zero, 4(a0)
-; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sw zero, 4(a0)
; CHECK-NEXT: sh zero, 8(a0)
; CHECK-NEXT: ret
entry:
@@ -71,15 +62,12 @@ entry:
define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lhu a1, 2(a0)
; CHECK-NEXT: lbu a2, 1(a0)
-; CHECK-NEXT: lbu a3, 3(a0)
; CHECK-NEXT: lbu a0, 4(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry:
@@ -91,15 +79,12 @@ entry:
define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lhu a1, 4(a0)
; CHECK-NEXT: lbu a2, 3(a0)
-; CHECK-NEXT: lbu a3, 5(a0)
; CHECK-NEXT: lbu a0, 6(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry: