[llvm] [SelectionDAG] Optimize unaligned load stores to realign after offset (PR #145309)
Acthinks Yang via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 00:24:40 PDT 2025
https://github.com/Acthinks updated https://github.com/llvm/llvm-project/pull/145309
From f4e2549b4c68489b995365de77f11d9b0a43d7f8 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Tue, 1 Jul 2025 09:56:20 +0800
Subject: [PATCH 1/4] [PreCommit] Unaligned load/store realigned after offset
---
.../unaligned-load-store-with-aligned.ll | 157 ++++++++++++++++++
1 file changed, 157 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
new file mode 100644
index 0000000000000..f0a7e18054970
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 1(a0)
+; CHECK-NEXT: sb a4, 2(a0)
+; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sb a2, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: srli a2, a1, 24
+; CHECK-NEXT: srli a3, a1, 16
+; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: sb a1, 3(a0)
+; CHECK-NEXT: sb a4, 4(a0)
+; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sb a2, 6(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sb zero, 5(a0)
+; CHECK-NEXT: sb zero, 6(a0)
+; CHECK-NEXT: sb zero, 7(a0)
+; CHECK-NEXT: sb zero, 8(a0)
+; CHECK-NEXT: sb zero, 1(a0)
+; CHECK-NEXT: sb zero, 2(a0)
+; CHECK-NEXT: sb zero, 3(a0)
+; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i64 0, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sh zero, 4(a0)
+; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sh zero, 8(a0)
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ store i64 0, ptr %len, align 2
+ ret void
+}
+
+define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lbu a2, 1(a0)
+; CHECK-NEXT: lbu a3, 3(a0)
+; CHECK-NEXT: lbu a0, 4(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lbu a2, 3(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a0, 6(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a3, a3, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a3
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i64 @load_b64_base_align2_offset1(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a1, 3(a0)
+; CHECK-NEXT: lbu a2, 4(a0)
+; CHECK-NEXT: lbu a3, 5(a0)
+; CHECK-NEXT: lbu a4, 2(a0)
+; CHECK-NEXT: slli a1, a1, 8
+; CHECK-NEXT: slli a2, a2, 16
+; CHECK-NEXT: slli a3, a3, 24
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: or a2, a3, a2
+; CHECK-NEXT: lbu a3, 7(a0)
+; CHECK-NEXT: lbu a4, 6(a0)
+; CHECK-NEXT: lbu a5, 8(a0)
+; CHECK-NEXT: lbu a0, 9(a0)
+; CHECK-NEXT: slli a3, a3, 8
+; CHECK-NEXT: or a3, a3, a4
+; CHECK-NEXT: slli a5, a5, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a5, a0, a5
+; CHECK-NEXT: or a0, a2, a1
+; CHECK-NEXT: or a1, a5, a3
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 1
+ ret i64 %v
+}
+
+define i64 @load_b64_base_align2_offset2(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lhu a1, 4(a0)
+; CHECK-NEXT: lhu a2, 2(a0)
+; CHECK-NEXT: lhu a3, 8(a0)
+; CHECK-NEXT: lhu a4, 6(a0)
+; CHECK-NEXT: slli a0, a1, 16
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: slli a1, a3, 16
+; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 2
+ ret i64 %v
+}
From 04d5ecc1ff4d9e299a6f41f68ac98325873a1ed7 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Mon, 23 Jun 2025 17:56:44 +0800
Subject: [PATCH 2/4] [SelectionDAG] Optimize unaligned load stores to realign
after offset
Summary:
For loads/stores from GEPs:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve base pointer's stronger alignment
- Optimize expandUnalignedLoad/Store
Issue: #143215
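A minimal IR sketch of the pattern this patch targets, mirroring the example in the
visitStore comment below (the function and value names here are illustrative, not taken
from the tests):

    ; base pointer known align 4, access itself only align 1
    define void @example(ptr align 4 %p, i32 %v) {
    entry:
      %gep = getelementptr inbounds nuw i8, ptr %p, i32 1
      store i32 %v, ptr %gep, align 1
      ret void
    }

With this change the memory operand is recorded as (V = %p, Offset = 1, basealign 4)
instead of (V = %gep, Offset = 0, align 1), so expandUnalignedStore can emit a
byte/half/byte sequence rather than four byte stores on targets without fast unaligned
access (see the RISC-V test updates further down).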
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 20 +--
llvm/lib/CodeGen/MachineOperand.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 65 ++++++++-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 125 +++++++++++++++---
.../branch-folding-implicit-def-subreg.ll | 4 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 2 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 54 +++-----
.../unaligned-load-store-with-aligned.ll | 35 ++---
8 files changed, 215 insertions(+), 92 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index a297d3d8f8498..6958a86c37cae 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,27 +50,28 @@ struct MachinePointerInfo {
uint8_t StackID;
+ const Value *OrgV;
+
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ uint8_t ID = 0, const Value *orgv = nullptr)
+ : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
AddrSpace = v ? v->getAddressSpace() : 0;
}
explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
- StackID(0) {}
+ StackID(0), OrgV((const Value *)nullptr) {}
explicit MachinePointerInfo(
- PointerUnion<const Value *, const PseudoSourceValue *> v,
- int64_t offset = 0,
- uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID) {
+ PointerUnion<const Value *, const PseudoSourceValue *> v,
+ int64_t offset = 0, uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -83,7 +84,8 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
- return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
+ return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
+ OrgV);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d251697f2567..6f10ab39a8ab8 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
- dyn_cast<Instruction>(BasePtr));
+ dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ecd1ff87e7fbc..3a3c19e5dc36b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4567,10 +4567,41 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}
+static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
+ const Value *&PtrV, const Value *&CxtI,
+ int64_t &Offset) {
+ Align PrefAlign = DL.getPrefTypeAlign(Ty);
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+ GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ KnownBits Known = computeKnownBits(PtrV, DL);
+ KnownBits SplitKnown =
+ KnownBits::add(Known, KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), Alignment.value())));
+ unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+ +Value::MaxAlignmentExponent);
+ Align ExpandAlign =
+ Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
+ if (ExpandAlign > Alignment) {
+ CxtI = PtrV;
+ PtrV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
+}
+
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4592,7 +4623,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4602,7 +4633,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+ TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+ // See visitStore comments.
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
SDValue Root;
bool ConstantMemory = false;
@@ -4652,7 +4688,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
+ CxtI)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4739,6 +4776,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
+ const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4759,8 +4797,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+ &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4777,7 +4815,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+ // refine MPI: V + Offset
+ // Example:
+ // align 4 %p
+ // %gep = getelementptr i8, ptr %p, i32 1
+ // store i32 %v, ptr %len, align 1
+ // ->
+ // MPI: V = %p, Offset = 1
+ // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+ int64_t Offset = 0;
+ const Value *CxtI = nullptr;
+ tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4792,7 +4842,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
+ 0, CxtI)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 000f8cc6786a5..7f3983db095d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,6 +10370,59 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");
+ Align BaseAlignment = LD->getBaseAlign();
+ Align Alignment = LD->getAlign();
+
+ // Divide the load according to the latest align information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + LD->getPointerInfo().Offset) >
+ Alignment) {
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ SDValue Result = DAG.getConstant(0, dl, VT);
+ SmallVector<SDValue, 4> Chains;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
+ if (RemainderBytes + CurrBytes == NumBytes)
+ ExtType = HiExtType;
+
+ SDValue CurrLD = DAG.getExtLoad(
+ ExtType, dl, VT, Chain,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ LD->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ if (IsLE)
+ Chains.push_back(CurrLD.getValue(1));
+ else
+ Chains.insert(Chains.begin(), CurrLD.getValue(1));
+ SDValue CurrV = DAG.getNode(
+ ISD::SHL, dl, VT, CurrLD,
+ DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
+ Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+ Alignment = commonAlignment(BaseAlignment,
+ LD->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ return std::make_pair(Result, TF);
+ }
// Compute the new VT that is half the size of the old one. This is an
// integer MVT.
unsigned NumBits = LoadedVT.getSizeInBits();
@@ -10377,7 +10430,6 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
NumBits >>= 1;
- Align Alignment = LD->getBaseAlign();
unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();
@@ -10389,24 +10441,24 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
- LD->getAAInfo());
+ NewLoadedVT, BaseAlignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
// aggregate the two parts
@@ -10428,7 +10480,8 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Ptr = ST->getBasePtr();
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
- Align Alignment = ST->getBaseAlign();
+ Align BaseAlignment = ST->getBaseAlign();
+ Align Alignment = ST->getAlign();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();
@@ -10447,7 +10500,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
// FIXME: Does not handle truncating floating point stores!
SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
- Alignment, ST->getMemOperand()->getFlags());
+ BaseAlignment, ST->getMemOperand()->getFlags());
return Result;
}
// Do a (aligned) store to a stack slot, then copy from the stack slot
@@ -10515,6 +10568,47 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");
+
+ // Divide the store value according to the latest align information
+ if (commonAlignment(BaseAlignment,
+ Alignment.value() + ST->getPointerInfo().Offset) >
+ Alignment) {
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
+ SmallVector<SDValue, 8> Stores;
+ // LE/BE use the same initial Alignment
+ unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+ unsigned RemainderBytes = NumBytes;
+ while (RemainderBytes) {
+ unsigned CurrBytes =
+ std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+ SDValue CurrST = DAG.getTruncStore(
+ Chain, dl, Val,
+ DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+ ST->getPointerInfo().getWithOffset(PtrOffset),
+ EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ if (IsLE)
+ Stores.push_back(CurrST);
+ else
+ Stores.insert(Stores.begin(), CurrST);
+ RemainderBytes -= CurrBytes;
+ if (RemainderBytes == 0)
+ break;
+
+ Val = DAG.getNode(ISD::SRL, dl, VT, Val,
+ DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
+ Alignment = commonAlignment(BaseAlignment,
+ ST->getPointerInfo().Offset + PtrOffset +
+ (IsLE ? CurrBytes : -CurrBytes));
+ PtrOffset =
+ IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+ }
+
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
// Get the half-size VT
EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
unsigned NumBits = NewStoredVT.getFixedSizeInBits();
@@ -10538,17 +10632,18 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Store1, Store2;
Store1 = DAG.getTruncStore(Chain, dl,
DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
- Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags());
+ Ptr, ST->getPointerInfo(), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Store2 = DAG.getTruncStore(
Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+ BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
return Result;
}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index ae90cfb631e8d..d04ce840ce5b8 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..557aa8f35001f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..18ec2144f13d4 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:50
+; SI-NEXT: s_load_dword s3, s[4:5], 0xd
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4725,11 +4724,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: s_lshl_b32 s0, s3, 24
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v2, s0, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4741,46 +4738,39 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 3
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_add_u32 s0, s4, 51
-; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x34
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s2, s0, 24
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[4:5], 0x24
+; VI-NEXT: s_load_dword s3, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_or_b32_e32 v6, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dword v[2:3], v7
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v9
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: flat_store_dword v[2:3], v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
index f0a7e18054970..721ef95a21866 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -6,11 +6,9 @@ define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 1(a0)
-; CHECK-NEXT: sb a4, 2(a0)
-; CHECK-NEXT: sb a3, 3(a0)
+; CHECK-NEXT: sh a3, 2(a0)
; CHECK-NEXT: sb a2, 4(a0)
; CHECK-NEXT: ret
entry:
@@ -23,11 +21,9 @@ define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
; CHECK-LABEL: store_b32_basealign2_offset3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: srli a2, a1, 24
-; CHECK-NEXT: srli a3, a1, 16
-; CHECK-NEXT: srli a4, a1, 8
+; CHECK-NEXT: srli a3, a1, 8
; CHECK-NEXT: sb a1, 3(a0)
-; CHECK-NEXT: sb a4, 4(a0)
-; CHECK-NEXT: sb a3, 5(a0)
+; CHECK-NEXT: sh a3, 4(a0)
; CHECK-NEXT: sb a2, 6(a0)
; CHECK-NEXT: ret
entry:
@@ -39,14 +35,10 @@ entry:
define void @store_b64_basealign4_offset1(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sb zero, 5(a0)
-; CHECK-NEXT: sb zero, 6(a0)
-; CHECK-NEXT: sb zero, 7(a0)
-; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: sb zero, 1(a0)
-; CHECK-NEXT: sb zero, 2(a0)
-; CHECK-NEXT: sb zero, 3(a0)
-; CHECK-NEXT: sb zero, 4(a0)
+; CHECK-NEXT: sh zero, 2(a0)
+; CHECK-NEXT: sw zero, 4(a0)
+; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: ret
entry:
%len = getelementptr inbounds nuw i8, ptr %p, i32 1
@@ -58,8 +50,7 @@ define void @store_b64_basealign4_offset2(ptr align 4 %p) {
; CHECK-LABEL: store_b64_basealign4_offset2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh zero, 2(a0)
-; CHECK-NEXT: sh zero, 4(a0)
-; CHECK-NEXT: sh zero, 6(a0)
+; CHECK-NEXT: sw zero, 4(a0)
; CHECK-NEXT: sh zero, 8(a0)
; CHECK-NEXT: ret
entry:
@@ -71,15 +62,12 @@ entry:
define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 2(a0)
+; CHECK-NEXT: lhu a1, 2(a0)
; CHECK-NEXT: lbu a2, 1(a0)
-; CHECK-NEXT: lbu a3, 3(a0)
; CHECK-NEXT: lbu a0, 4(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry:
@@ -91,15 +79,12 @@ entry:
define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 4(a0)
+; CHECK-NEXT: lhu a1, 4(a0)
; CHECK-NEXT: lbu a2, 3(a0)
-; CHECK-NEXT: lbu a3, 5(a0)
; CHECK-NEXT: lbu a0, 6(a0)
; CHECK-NEXT: slli a1, a1, 8
; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a3, a3, 16
; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a3
; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
entry:
From b5e656e56adee72a26eceb70fb0d30019e935d4d Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Sun, 6 Jul 2025 18:25:59 +0800
Subject: [PATCH 3/4] use isValidAssumeForContext with AllowEphemerals=true
when determining dereferenceability
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 20 +++++++++----------
llvm/lib/Analysis/Loads.cpp | 4 ++--
llvm/lib/CodeGen/MachineOperand.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 15 +++++---------
4 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 6958a86c37cae..a297d3d8f8498 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,28 +50,27 @@ struct MachinePointerInfo {
uint8_t StackID;
- const Value *OrgV;
-
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
- uint8_t ID = 0, const Value *orgv = nullptr)
- : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
+ uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
+ : V(v), Offset(offset), StackID(ID) {
AddrSpace = v ? v->getAddressSpace() : 0;
}
explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
- StackID(0), OrgV((const Value *)nullptr) {}
+ StackID(0) {}
explicit MachinePointerInfo(
- PointerUnion<const Value *, const PseudoSourceValue *> v,
- int64_t offset = 0, uint8_t ID = 0)
- : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
+ PointerUnion<const Value *, const PseudoSourceValue *> v,
+ int64_t offset = 0,
+ uint8_t ID = 0)
+ : V(v), Offset(offset), StackID(ID) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -84,8 +83,7 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
- return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
- OrgV);
+ return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 880249588f0b2..ba23f8fc69d2d 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -111,7 +111,7 @@ static bool isDereferenceableAndAlignedPointer(
// anyway.
auto *I = dyn_cast<Instruction>(V);
if (I && !isa<AllocaInst>(I))
- return CtxI && isValidAssumeForContext(I, CtxI, DT);
+ return CtxI && isValidAssumeForContext(I, CtxI, DT, true);
return true;
};
if (IsKnownDeref()) {
@@ -183,7 +183,7 @@ static bool isDereferenceableAndAlignedPointer(
if (getKnowledgeForValue(
V, {Attribute::Dereferenceable, Attribute::Alignment}, *AC,
[&](RetainedKnowledge RK, Instruction *Assume, auto) {
- if (!isValidAssumeForContext(Assume, CtxI, DT))
+ if (!isValidAssumeForContext(Assume, CtxI, DT, true))
return false;
if (RK.AttrKind == Attribute::Alignment)
AlignRK = std::max(AlignRK, RK);
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 6f10ab39a8ab8..0d251697f2567 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
- dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
+ dyn_cast<Instruction>(BasePtr));
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3a3c19e5dc36b..e58ce40cfdfd9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4568,8 +4568,7 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
}
static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
- const Value *&PtrV, const Value *&CxtI,
- int64_t &Offset) {
+ const Value *&PtrV, int64_t &Offset) {
Align PrefAlign = DL.getPrefTypeAlign(Ty);
if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
@@ -4588,7 +4587,6 @@ static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
Align BaseAlignment =
getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
if (ExpandAlign > Alignment) {
- CxtI = PtrV;
PtrV = BasePtrV;
Alignment = BaseAlignment;
Offset = OffsetAccumulated.getSExtValue();
@@ -4637,8 +4635,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// See visitStore comments.
int64_t Offset = 0;
- const Value *CxtI = nullptr;
- tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
+ tryToImproveAlign(DL, Ty, Alignment, SV, Offset);
SDValue Root;
bool ConstantMemory = false;
@@ -4688,8 +4685,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
- CxtI)
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4824,8 +4820,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// MPI: V = %p, Offset = 1
// SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
int64_t Offset = 0;
- const Value *CxtI = nullptr;
- tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+ tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, Offset);
auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
@@ -4843,7 +4838,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
- 0, CxtI)
+ 0)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
From e2505538b4da902123b8c411d0c436f95aa16c97 Mon Sep 17 00:00:00 2001
From: Acthinks <yangzhh at mail.ustc.edu.cn>
Date: Mon, 7 Jul 2025 14:00:39 +0800
Subject: [PATCH 4/4] add align_offset metadata
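A rough sketch of what the new annotation might look like in IR, inferred from
MDBuilder::createAlignOffset and the MachinePointerInfo constructor in this patch
(operand widths and the exact attachment point are assumptions; the new
InferAlignment test added below is the authoritative form):

    %gep = getelementptr inbounds nuw i8, ptr %p, i32 1, !align_offset !0
    %v = load i32, ptr %gep, align 1
    ...
    !0 = !{i64 4, i64 1}  ; known alignment of the base, constant offset from it

MachineMemOperand::getAlign() then folds its own offset into this pair
(commonAlignment(Align, AlignOffset + MMO offset)), which is how
expandUnalignedLoad/Store recover the stronger effective alignment without the
SelectionDAGBuilder changes from the previous revision.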
---
llvm/include/llvm/CodeGen/MachineMemOperand.h | 16 ++
llvm/include/llvm/IR/FixedMetadataKinds.def | 1 +
llvm/include/llvm/IR/MDBuilder.h | 2 +
llvm/lib/Analysis/Loads.cpp | 4 +-
llvm/lib/CodeGen/MachineOperand.cpp | 5 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 60 +-----
.../CodeGen/SelectionDAG/TargetLowering.cpp | 183 ++++++------------
llvm/lib/IR/MDBuilder.cpp | 9 +
llvm/lib/Transforms/Scalar/InferAlignment.cpp | 37 +++-
.../branch-folding-implicit-def-subreg.ll | 4 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 2 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 54 +++---
.../unaligned-load-store-with-aligned.ll | 98 +++++-----
.../InferAlignment/gep-alignoffset.ll | 173 +++++++++++++++++
14 files changed, 388 insertions(+), 260 deletions(-)
create mode 100644 llvm/test/Transforms/InferAlignment/gep-alignoffset.ll
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index a297d3d8f8498..8848240692f56 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -20,7 +20,9 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Value.h" // PointerLikeTypeTraits<Value*>
@@ -49,11 +51,22 @@ struct MachinePointerInfo {
unsigned AddrSpace = 0;
uint8_t StackID;
+ std::optional<std::pair<Align, int64_t>> AlignOffset = std::nullopt;
explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
uint8_t ID = 0)
: V(v), Offset(offset), StackID(ID) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
+ if (v && isa<Instruction>(v)) {
+ auto *I = cast<Instruction>(v);
+ if (auto *MDAO = I->getMetadata(LLVMContext::MD_align_offset)) {
+ Align Al(
+ mdconst::extract<ConstantInt>(MDAO->getOperand(0))->getZExtValue());
+ int64_t Offset =
+ mdconst::extract<ConstantInt>(MDAO->getOperand(1))->getSExtValue();
+ AlignOffset = std::make_pair(Al, Offset);
+ }
+ }
}
explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
@@ -233,6 +246,9 @@ class MachineMemOperand {
/// For PseudoSourceValue::FPRel values, this is the FrameIndex number.
int64_t getOffset() const { return PtrInfo.Offset; }
+ std::optional<std::pair<Align, int64_t>> getAlignOffset() const {
+ return PtrInfo.AlignOffset;
+ }
unsigned getAddrSpace() const { return PtrInfo.getAddrSpace(); }
/// Return the memory type of the memory reference. This should only be relied
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index df572e8791e13..a0c50f9327d8f 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -53,3 +53,4 @@ LLVM_FIXED_MD_KIND(MD_DIAssignID, "DIAssignID", 38)
LLVM_FIXED_MD_KIND(MD_coro_outside_frame, "coro.outside.frame", 39)
LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40)
LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41)
+LLVM_FIXED_MD_KIND(MD_align_offset, "align_offset", 42)
diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h
index df1c66cf24fcb..d5d6a49c90b37 100644
--- a/llvm/include/llvm/IR/MDBuilder.h
+++ b/llvm/include/llvm/IR/MDBuilder.h
@@ -113,6 +113,8 @@ class MDBuilder {
/// Return metadata describing the range [Lo, Hi).
LLVM_ABI MDNode *createRange(Constant *Lo, Constant *Hi);
+ LLVM_ABI MDNode *createAlignOffset(Align Al, APInt Off);
+
//===------------------------------------------------------------------===//
// Callees metadata.
//===------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index ba23f8fc69d2d..880249588f0b2 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -111,7 +111,7 @@ static bool isDereferenceableAndAlignedPointer(
// anyway.
auto *I = dyn_cast<Instruction>(V);
if (I && !isa<AllocaInst>(I))
- return CtxI && isValidAssumeForContext(I, CtxI, DT, true);
+ return CtxI && isValidAssumeForContext(I, CtxI, DT);
return true;
};
if (IsKnownDeref()) {
@@ -183,7 +183,7 @@ static bool isDereferenceableAndAlignedPointer(
if (getKnowledgeForValue(
V, {Attribute::Dereferenceable, Attribute::Alignment}, *AC,
[&](RetainedKnowledge RK, Instruction *Assume, auto) {
- if (!isValidAssumeForContext(Assume, CtxI, DT, true))
+ if (!isValidAssumeForContext(Assume, CtxI, DT))
return false;
if (RK.AttrKind == Attribute::Alignment)
AlignRK = std::max(AlignRK, RK);
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d251697f2567..7908a1560bca5 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1136,7 +1136,10 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
/// getAlign - Return the minimum known alignment in bytes of the
/// actual memory reference.
Align MachineMemOperand::getAlign() const {
- return commonAlignment(getBaseAlign(), getOffset());
+ return getAlignOffset()
+ ? commonAlignment(getAlignOffset()->first,
+ getAlignOffset()->second + getOffset())
+ : commonAlignment(getBaseAlign(), getOffset());
}
void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e58ce40cfdfd9..ecd1ff87e7fbc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4567,39 +4567,10 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}
-static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
- const Value *&PtrV, int64_t &Offset) {
- Align PrefAlign = DL.getPrefTypeAlign(Ty);
- if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
- GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
- const Value *BasePtrV = GEP->getPointerOperand();
- APInt OffsetAccumulated =
- APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
- if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
- KnownBits Known = computeKnownBits(PtrV, DL);
- KnownBits SplitKnown =
- KnownBits::add(Known, KnownBits::makeConstant(APInt(
- Known.getBitWidth(), Alignment.value())));
- unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
- +Value::MaxAlignmentExponent);
- Align ExpandAlign =
- Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
- Align BaseAlignment =
- getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
- if (ExpandAlign > Alignment) {
- PtrV = BasePtrV;
- Alignment = BaseAlignment;
- Offset = OffsetAccumulated.getSExtValue();
- }
- }
- }
-}
-
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
- const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4621,7 +4592,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4631,11 +4602,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
-
- // See visitStore comments.
- int64_t Offset = 0;
- tryToImproveAlign(DL, Ty, Alignment, SV, Offset);
+ TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
SDValue Root;
bool ConstantMemory = false;
@@ -4685,7 +4652,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0)
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4772,7 +4739,6 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
- const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4793,8 +4759,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
- &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4811,18 +4777,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- // refine MPI: V + Offset
- // Example:
- // align 4 %p
- // %gep = getelementptr i8, ptr %p, i32 1
- // store i32 %v, ptr %len, align 1
- // ->
- // MPI: V = %p, Offset = 1
- // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
- int64_t Offset = 0;
- tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, Offset);
-
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4837,8 +4792,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
- 0)
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7f3983db095d2..80dd44b9e3a8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,69 +10370,22 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");
+ // Compute the new VTs that are aligned/half and remainder size of the old
+ // one. This is an integer MVT.
+ unsigned NumBits = LoadedVT.getSizeInBits();
+ assert(NumBits / 8 % 2 == 0 && "NumBits is not a multiple of 2bytes!");
Align BaseAlignment = LD->getBaseAlign();
Align Alignment = LD->getAlign();
+ unsigned NumBitsAlignedOrHalf =
+ LD->getPointerInfo().AlignOffset ? Alignment.value() * 8 : NumBits / 2;
+ unsigned NumBitsRemainder = NumBits - NumBitsAlignedOrHalf;
+ EVT AlignedLoadedVT, RemainderLoadedVT;
+ AlignedLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBitsAlignedOrHalf);
+ RemainderLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBitsRemainder);
- // Divide the load according to the latest align information
- if (commonAlignment(BaseAlignment,
- Alignment.value() + LD->getPointerInfo().Offset) >
- Alignment) {
- ISD::LoadExtType HiExtType = LD->getExtensionType();
-
- // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
- if (HiExtType == ISD::NON_EXTLOAD)
- HiExtType = ISD::ZEXTLOAD;
-
- bool IsLE = DAG.getDataLayout().isLittleEndian();
- unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
- // LE/BE use the same initial Alignment
- unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
- unsigned RemainderBytes = NumBytes;
- SDValue Result = DAG.getConstant(0, dl, VT);
- SmallVector<SDValue, 4> Chains;
- while (RemainderBytes) {
- unsigned CurrBytes =
- std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
- ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
- if (RemainderBytes + CurrBytes == NumBytes)
- ExtType = HiExtType;
-
- SDValue CurrLD = DAG.getExtLoad(
- ExtType, dl, VT, Chain,
- DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
- LD->getPointerInfo().getWithOffset(PtrOffset),
- EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
- LD->getMemOperand()->getFlags(), LD->getAAInfo());
- if (IsLE)
- Chains.push_back(CurrLD.getValue(1));
- else
- Chains.insert(Chains.begin(), CurrLD.getValue(1));
- SDValue CurrV = DAG.getNode(
- ISD::SHL, dl, VT, CurrLD,
- DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
- Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
- RemainderBytes -= CurrBytes;
- if (RemainderBytes == 0)
- break;
- Alignment = commonAlignment(BaseAlignment,
- LD->getPointerInfo().Offset + PtrOffset +
- (IsLE ? CurrBytes : -CurrBytes));
- PtrOffset =
- IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
- }
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
- return std::make_pair(Result, TF);
- }
- // Compute the new VT that is half the size of the old one. This is an
- // integer MVT.
- unsigned NumBits = LoadedVT.getSizeInBits();
- EVT NewLoadedVT;
- NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
- NumBits >>= 1;
+ unsigned IncrementSize = NumBitsAlignedOrHalf / 8;
- unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();
-
// If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
if (HiExtType == ISD::NON_EXTLOAD)
HiExtType = ISD::ZEXTLOAD;
@@ -10441,28 +10394,31 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, BaseAlignment,
+ AlignedLoadedVT, BaseAlignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, BaseAlignment,
+ RemainderLoadedVT, BaseAlignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, BaseAlignment,
+ AlignedLoadedVT, BaseAlignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, BaseAlignment,
+ RemainderLoadedVT, BaseAlignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
// aggregate the two parts
- SDValue ShiftAmount = DAG.getShiftAmountConstant(NumBits, VT, dl);
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(
+ DAG.getDataLayout().isLittleEndian() ? NumBitsAlignedOrHalf
+ : NumBitsRemainder,
+ VT, dl);
SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
@@ -10481,7 +10437,6 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
Align BaseAlignment = ST->getBaseAlign();
- Align Alignment = ST->getAlign();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();
@@ -10569,81 +10524,53 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");
- // Divide the store value according to the latest align information
- if (commonAlignment(BaseAlignment,
- Alignment.value() + ST->getPointerInfo().Offset) >
- Alignment) {
- bool IsLE = DAG.getDataLayout().isLittleEndian();
- unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
- SmallVector<SDValue, 8> Stores;
- // LE/BE use the same initial Alignment
- unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
- unsigned RemainderBytes = NumBytes;
- while (RemainderBytes) {
- unsigned CurrBytes =
- std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
- SDValue CurrST = DAG.getTruncStore(
- Chain, dl, Val,
- DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
- ST->getPointerInfo().getWithOffset(PtrOffset),
- EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
- if (IsLE)
- Stores.push_back(CurrST);
- else
- Stores.insert(Stores.begin(), CurrST);
- RemainderBytes -= CurrBytes;
- if (RemainderBytes == 0)
- break;
-
- Val = DAG.getNode(ISD::SRL, dl, VT, Val,
- DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
- Alignment = commonAlignment(BaseAlignment,
- ST->getPointerInfo().Offset + PtrOffset +
- (IsLE ? CurrBytes : -CurrBytes));
- PtrOffset =
- IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
- }
-
- SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
- return Result;
- }
-
- // Get the half-size VT
- EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
- unsigned NumBits = NewStoredVT.getFixedSizeInBits();
- unsigned IncrementSize = NumBits / 8;
+ Align Alignment = ST->getAlign();
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
- // Divide the stored value in two parts.
- SDValue ShiftAmount =
- DAG.getShiftAmountConstant(NumBits, Val.getValueType(), dl);
- SDValue Lo = Val;
+ // Divide the stored value in two parts: aligned/half and remainder
+ unsigned NumBits = StoreMemVT.getFixedSizeInBits();
+ assert(NumBits / 8 % 2 == 0 && "NumBits is not a multiple of 2bytes!");
+ unsigned NumBitsAlignedOrHalf =
+ ST->getPointerInfo().AlignOffset ? Alignment.value() * 8 : NumBits / 2;
+ unsigned NumBitsRemainder = NumBits - NumBitsAlignedOrHalf;
+ EVT StoredVTAligned =
+ EVT::getIntegerVT(*DAG.getContext(), NumBitsAlignedOrHalf);
+ EVT StoredVTRemainder =
+ EVT::getIntegerVT(*DAG.getContext(), NumBitsRemainder);
+
+ unsigned IncrementSize = NumBitsAlignedOrHalf / 8;
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(
+ IsLE ? NumBitsAlignedOrHalf : NumBitsRemainder, Val.getValueType(), dl);
+ SDValue AlignedVal, RemainderVal;
+ AlignedVal = RemainderVal = Val;
// If Val is a constant, replace the upper bits with 0. The SRL will constant
// fold and not use the upper bits. A smaller constant may be easier to
// materialize.
- if (auto *C = dyn_cast<ConstantSDNode>(Lo); C && !C->isOpaque())
- Lo = DAG.getNode(
- ISD::AND, dl, VT, Lo,
- DAG.getConstant(APInt::getLowBitsSet(VT.getSizeInBits(), NumBits), dl,
- VT));
- SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
-
+ SDValue TempVal = Val;
+ if (auto *C = dyn_cast<ConstantSDNode>(Val); C && !C->isOpaque())
+ TempVal = DAG.getNode(
+ ISD::AND, dl, VT, Val,
+ DAG.getConstant(
+ APInt::getLowBitsSet(VT.getSizeInBits(), IsLE ? NumBitsAlignedOrHalf
+ : NumBitsRemainder),
+ dl, VT));
+ AlignedVal = IsLE ? TempVal : DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+ RemainderVal =
+ IsLE ? DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount) : TempVal;
// Store the two parts
SDValue Store1, Store2;
- Store1 = DAG.getTruncStore(Chain, dl,
- DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
- Ptr, ST->getPointerInfo(), NewStoredVT,
- BaseAlignment, ST->getMemOperand()->getFlags());
+ Store1 = DAG.getTruncStore(Chain, dl, AlignedVal, Ptr, ST->getPointerInfo(),
+ StoredVTAligned, BaseAlignment,
+ ST->getMemOperand()->getFlags());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
- Store2 = DAG.getTruncStore(
- Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
- BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ Store2 = DAG.getTruncStore(Chain, dl, RemainderVal, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ StoredVTRemainder, BaseAlignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
-
return Result;
}
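
For readers skimming the diff, here is a minimal standalone sketch (not part of the patch) of the split arithmetic the reworked expandUnalignedStore/expandUnalignedLoad path performs, worked through for the load_b64_base_align4_offset2 case from the RISC-V test below (base align 4, constant offset 2, i64 access). The variable names mirror the patch, but main/printf are illustrative only.

#include <cstdio>

int main() {
  unsigned NumBits = 64;        // i64 access being expanded
  unsigned AlignBits = 2 * 8;   // getAlign() == 2 once the +2 offset is applied
  bool HasAlignOffset = true;   // pointer info carries base-align/offset knowledge
  unsigned NumBitsAlignedOrHalf =
      HasAlignOffset ? AlignBits : NumBits / 2;                // 16
  unsigned NumBitsRemainder = NumBits - NumBitsAlignedOrHalf;  // 48
  unsigned IncrementSize = NumBitsAlignedOrHalf / 8;           // 2
  // Little-endian: the low 16 bits are accessed at +0, and the 48-bit tail at
  // +IncrementSize is 4-byte aligned again, so its own expansion can use 32-
  // and 16-bit accesses -- the lwu/lhu/lhu sequence in the RISC-V test.
  std::printf("%u bits at +0, then %u bits at +%u\n", NumBitsAlignedOrHalf,
              NumBitsRemainder, IncrementSize);
  return 0;
}
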
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index 893f99a19b936..633d47d5b3059 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -108,6 +108,15 @@ MDNode *MDBuilder::createRange(Constant *Lo, Constant *Hi) {
// Return the range [Lo, Hi).
return MDNode::get(Context, {createConstant(Lo), createConstant(Hi)});
}
+LLVM_ABI MDNode *MDBuilder::createAlignOffset(Align Al, APInt Off) {
+ if (Al.value() == 1)
+ return nullptr;
+ Type *Int64 = Type::getInt64Ty(Context);
+ return MDNode::get(Context,
+ {createConstant(ConstantInt::get(Int64, Al.value())),
+ createConstant(ConstantInt::get(
+ Type::getIntNTy(Context, Off.getBitWidth()), Off))});
+}
MDNode *MDBuilder::createCallees(ArrayRef<Function *> Callees) {
SmallVector<Metadata *, 4> Ops;
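
As a hedged usage sketch (not part of this patch), the new createAlignOffset helper above would typically be driven the way the InferAlignment change below does: build the two-operand node {base alignment, constant offset} and hang it off the GEP that feeds the unaligned access. The annotateGEP wrapper name is made up for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Attach !align_offset = !{BaseAlign, Offset} to a GEP; createAlignOffset
// returns nullptr for align 1, in which case nothing is attached.
static void annotateGEP(GetElementPtrInst *GEP, Align BaseAlign,
                        const APInt &Offset) {
  MDBuilder MDB(GEP->getContext());
  if (MDNode *AONode = MDB.createAlignOffset(BaseAlign, Offset))
    GEP->setMetadata(LLVMContext::MD_align_offset, AONode);
}
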
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0ddc23152d84f..598859569e631 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -65,7 +66,41 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
+Value::MaxAlignmentExponent);
- return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ Align NewAlign =
+ Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+
+ // Add !align_offset metadata describing the base alignment and constant offset
+ Align BetterAlign = std::max(NewAlign, OldAlign);
+ if (BetterAlign < PrefAlign) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrOp);
+ GEP && !GEP->getMetadata(LLVMContext::MD_align_offset)) {
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ KnownBits SplitKnown = KnownBits::add(
+ Known, KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), BetterAlign.value())));
+ unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+ +Value::MaxAlignmentExponent);
+ Align ExpandAlign =
+ Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ if (ExpandAlign > BetterAlign) {
+ KnownBits BaseKnown = KnownBits::sub(
+ Known, KnownBits::makeConstant(OffsetAccumulated));
+ unsigned TrailZ =
+ std::min(BaseKnown.countMinTrailingZeros(),
+ +Value::MaxAlignmentExponent);
+ Align BaseAlignment = Align(
+ 1ull << std::min(BaseKnown.getBitWidth() - 1, TrailZ));
+ MDBuilder MDB(GEP->getContext());
+ llvm::MDNode *AONode =
+ MDB.createAlignOffset(BaseAlignment, OffsetAccumulated);
+ GEP->setMetadata(LLVMContext::MD_align_offset, AONode);
+ }
+ }
+ }
+ }
+ return NewAlign;
});
}
}
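
To make the condition guarding the new metadata concrete, here is a small standalone sketch (plain modular arithmetic standing in for KnownBits), with values taken from the loop_base_align16_offset3 test below: base align 16, constant GEP offset 3. The helper name lowBit is illustrative only.

#include <cstdio>

// Lowest set bit of V, or Window when V is zero (all tracked bits clear).
static unsigned long long lowBit(unsigned long long V, unsigned long long Window) {
  return V ? (V & -V) : Window;
}

int main() {
  const unsigned long long BaseAlign = 16; // alignment of %p, from KnownBits
  const unsigned long long Offset = 3;     // accumulated constant GEP offset

  // Alignment provable for the GEP result itself: 1 here, i.e. the access
  // looks completely unaligned if only the GEP's own alignment is kept.
  unsigned long long GEPAlign = lowBit(Offset % BaseAlign, BaseAlign);               // 1
  // If stepping GEPAlign bytes past the GEP lands on a better-aligned address,
  // recording base alignment + offset lets the backend re-align after a split.
  unsigned long long Realigned = lowBit((Offset + GEPAlign) % BaseAlign, BaseAlign); // 4
  if (Realigned > GEPAlign)
    std::printf("attach !align_offset !{i64 %llu, i64 %llu}\n", BaseAlign, Offset);
  return 0;
}
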
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index d04ce840ce5b8..ae90cfb631e8d 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 557aa8f35001f..3303cb86c874e 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 18ec2144f13d4..a18b5b5396f63 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,12 +4708,13 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:50
-; SI-NEXT: s_load_dword s3, s[4:5], 0xd
+; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
+; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
+; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4724,9 +4725,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: s_lshl_b32 s0, s3, 24
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v2, s0, v2
+; SI-NEXT: v_or_b32_e32 v3, v3, v6
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4738,39 +4741,46 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_addc_u32 s3, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_load_dword s0, s[4:5], 0x34
+; VI-NEXT: s_add_u32 s0, s0, 3
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s4, 51
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s0, 24
+; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
+; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s3, s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: v_or_b32_e32 v6, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: flat_store_dword v[2:3], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dword v[2:3], v7
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v6
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
+; VI-NEXT: v_or_b32_e32 v4, v4, v9
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v5, v4
+; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
index 721ef95a21866..dcefd755aec05 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck %s
define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
@@ -12,7 +12,7 @@ define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
; CHECK-NEXT: sb a2, 4(a0)
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1, !align_offset !0
store i32 %v, ptr %len, align 1
ret void
}
@@ -27,7 +27,7 @@ define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
; CHECK-NEXT: sb a2, 6(a0)
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3, !align_offset !1
store i32 %v, ptr %len, align 1
ret void
}
@@ -41,7 +41,7 @@ define void @store_b64_basealign4_offset1(ptr align 4 %p) {
; CHECK-NEXT: sb zero, 8(a0)
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1, !align_offset !2
store i64 0, ptr %len, align 1
ret void
}
@@ -54,7 +54,7 @@ define void @store_b64_basealign4_offset2(ptr align 4 %p) {
; CHECK-NEXT: sh zero, 8(a0)
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2, !align_offset !3
store i64 0, ptr %len, align 2
ret void
}
@@ -63,15 +63,15 @@ define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lhu a1, 2(a0)
-; CHECK-NEXT: lbu a2, 1(a0)
-; CHECK-NEXT: lbu a0, 4(a0)
+; CHECK-NEXT: lbu a2, 4(a0)
+; CHECK-NEXT: lbu a0, 1(a0)
; CHECK-NEXT: slli a1, a1, 8
-; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: slli a2, a2, 24
+; CHECK-NEXT: or a1, a2, a1
+; CHECK-NEXT: or a0, a1, a0
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1, !align_offset !0
%v = load i32, ptr %len, align 1
ret i32 %v
}
@@ -80,63 +80,61 @@ define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
; CHECK-LABEL: load_b32_base_align2_offset3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lhu a1, 4(a0)
-; CHECK-NEXT: lbu a2, 3(a0)
-; CHECK-NEXT: lbu a0, 6(a0)
+; CHECK-NEXT: lbu a2, 6(a0)
+; CHECK-NEXT: lbu a0, 3(a0)
; CHECK-NEXT: slli a1, a1, 8
-; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: slli a2, a2, 24
+; CHECK-NEXT: or a1, a2, a1
+; CHECK-NEXT: or a0, a1, a0
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3, !align_offset !1
%v = load i32, ptr %len, align 1
ret i32 %v
}
-define i64 @load_b64_base_align2_offset1(ptr align 4 %p) {
-; CHECK-LABEL: load_b64_base_align2_offset1:
+define i64 @load_b64_base_align4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align4_offset1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lbu a1, 3(a0)
-; CHECK-NEXT: lbu a2, 4(a0)
-; CHECK-NEXT: lbu a3, 5(a0)
-; CHECK-NEXT: lbu a4, 2(a0)
+; CHECK-NEXT: lbu a1, 8(a0)
+; CHECK-NEXT: lhu a2, 6(a0)
+; CHECK-NEXT: lhu a3, 2(a0)
+; CHECK-NEXT: lhu a4, 4(a0)
+; CHECK-NEXT: lbu a0, 1(a0)
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: or a1, a1, a2
+; CHECK-NEXT: slli a4, a4, 16
+; CHECK-NEXT: or a3, a4, a3
+; CHECK-NEXT: or a1, a3, a1
; CHECK-NEXT: slli a1, a1, 8
-; CHECK-NEXT: slli a2, a2, 16
-; CHECK-NEXT: slli a3, a3, 24
-; CHECK-NEXT: or a1, a1, a4
-; CHECK-NEXT: or a2, a3, a2
-; CHECK-NEXT: lbu a3, 7(a0)
-; CHECK-NEXT: lbu a4, 6(a0)
-; CHECK-NEXT: lbu a5, 8(a0)
-; CHECK-NEXT: lbu a0, 9(a0)
-; CHECK-NEXT: slli a3, a3, 8
-; CHECK-NEXT: or a3, a3, a4
-; CHECK-NEXT: slli a5, a5, 16
-; CHECK-NEXT: slli a0, a0, 24
-; CHECK-NEXT: or a5, a0, a5
-; CHECK-NEXT: or a0, a2, a1
-; CHECK-NEXT: or a1, a5, a3
+; CHECK-NEXT: or a0, a1, a0
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1, !align_offset !2
%v = load i64, ptr %len, align 1
ret i64 %v
}
-define i64 @load_b64_base_align2_offset2(ptr align 4 %p) {
-; CHECK-LABEL: load_b64_base_align2_offset2:
+define i64 @load_b64_base_align4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align4_offset2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lhu a1, 4(a0)
-; CHECK-NEXT: lhu a2, 2(a0)
-; CHECK-NEXT: lhu a3, 8(a0)
-; CHECK-NEXT: lhu a4, 6(a0)
-; CHECK-NEXT: slli a0, a1, 16
-; CHECK-NEXT: or a0, a0, a2
-; CHECK-NEXT: slli a1, a3, 16
-; CHECK-NEXT: or a1, a1, a4
+; CHECK-NEXT: lwu a1, 4(a0)
+; CHECK-NEXT: lhu a2, 8(a0)
+; CHECK-NEXT: lhu a0, 2(a0)
+; CHECK-NEXT: slli a1, a1, 16
+; CHECK-NEXT: slli a2, a2, 48
+; CHECK-NEXT: or a1, a2, a1
+; CHECK-NEXT: or a0, a1, a0
; CHECK-NEXT: ret
entry:
- %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2, !align_offset !3
%v = load i64, ptr %len, align 2
ret i64 %v
}
+
+!0 = !{i64 2, i64 1}
+!1 = !{i64 2, i64 3}
+!2 = !{i64 4, i64 1}
+!3 = !{i64 4, i64 2}
+
diff --git a/llvm/test/Transforms/InferAlignment/gep-alignoffset.ll b/llvm/test/Transforms/InferAlignment/gep-alignoffset.ll
new file mode 100644
index 0000000000000..cee4e1f55632c
--- /dev/null
+++ b/llvm/test/Transforms/InferAlignment/gep-alignoffset.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=infer-alignment -S < %s | FileCheck %s
+
+define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: define void @store_b32_basealign2_offset1(
+; CHECK-SAME: ptr align 2 [[P:%.*]], i32 [[V:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 1, !align_offset [[META0:![0-9]+]]
+; CHECK-NEXT: store i32 [[V]], ptr [[LEN]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: define void @store_b32_basealign2_offset3(
+; CHECK-SAME: ptr align 2 [[P:%.*]], i32 [[V:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 3, !align_offset [[META1:![0-9]+]]
+; CHECK-NEXT: store i32 [[V]], ptr [[LEN]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: define void @store_b64_basealign4_offset1(
+; CHECK-SAME: ptr align 4 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 1, !align_offset [[META2:![0-9]+]]
+; CHECK-NEXT: store i64 0, ptr [[LEN]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ store i64 0, ptr %len, align 1
+ ret void
+}
+
+define void @store_b64_basealign4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: define void @store_b64_basealign4_offset2(
+; CHECK-SAME: ptr align 4 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 2, !align_offset [[META3:![0-9]+]]
+; CHECK-NEXT: store i64 0, ptr [[LEN]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ store i64 0, ptr %len, align 2
+ ret void
+}
+
+define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
+; CHECK-LABEL: define i32 @load_b32_base_align2_offset1(
+; CHECK-SAME: ptr align 2 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 1, !align_offset [[META0]]
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[LEN]], align 1
+; CHECK-NEXT: ret i32 [[V]]
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
+; CHECK-LABEL: define i32 @load_b32_base_align2_offset3(
+; CHECK-SAME: ptr align 2 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 3, !align_offset [[META1]]
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[LEN]], align 1
+; CHECK-NEXT: ret i32 [[V]]
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
+define i64 @load_b64_base_align4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: define i64 @load_b64_base_align4_offset1(
+; CHECK-SAME: ptr align 4 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 1, !align_offset [[META2]]
+; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[LEN]], align 1
+; CHECK-NEXT: ret i64 [[V]]
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+ %v = load i64, ptr %len, align 1
+ ret i64 %v
+}
+
+define i64 @load_b64_base_align4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: define i64 @load_b64_base_align4_offset2(
+; CHECK-SAME: ptr align 4 [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 2, !align_offset [[META3]]
+; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[LEN]], align 2
+; CHECK-NEXT: ret i64 [[V]]
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+ %v = load i64, ptr %len, align 2
+ ret i64 %v
+}
+
+%struct.str_t = type <{ i8, i8, i8, i32, i8, i32, i32 }>
+
+define dso_local void @loop_base_align16_offset3(ptr align 16 %p, i32 signext %N) {
+; CHECK-LABEL: define dso_local void @loop_base_align16_offset3(
+; CHECK-SAME: ptr align 16 [[P:%.*]], i32 signext [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_STR_T:%.*]], ptr [[P]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 3, !align_offset [[META4:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr [[LEN]], align 1
+; CHECK-NEXT: [[LEN2:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
+; CHECK-NEXT: store i32 2, ptr [[LEN2]], align 8
+; CHECK-NEXT: [[LEN3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
+; CHECK-NEXT: store i32 3, ptr [[LEN3]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %N, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw %struct.str_t, ptr %p, i64 %indvars.iv
+ %len = getelementptr inbounds nuw i8, ptr %arrayidx, i64 3
+ store i32 0, ptr %len, align 1
+ %len2 = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
+ store i32 2, ptr %len2, align 8
+ %len3 = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
+ store i32 3, ptr %len3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+;.
+; CHECK: [[META0]] = !{i64 2, i64 1}
+; CHECK: [[META1]] = !{i64 2, i64 3}
+; CHECK: [[META2]] = !{i64 4, i64 1}
+; CHECK: [[META3]] = !{i64 4, i64 2}
+; CHECK: [[META4]] = !{i64 16, i64 3}
+;.