[llvm] [DAG] shouldReduceLoadWidth - add optional<unsigned> byte offset argument (PR #136723)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 23 01:54:41 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/136723
From d2d0597f9ee1100662c21bcefbf6331172bb287c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 22 Apr 2025 17:04:22 +0100
Subject: [PATCH 1/3] [DAG] shouldReduceLoadWidth - add optional<unsigned> byte
offset argument
Based on feedback for #129695: we need to be able to determine the load offset of smaller loads when deciding whether a multi-use load should be split (in particular for AVX subvector extractions).
This patch adds a std::optional<unsigned> ByteOffset argument to shouldReduceLoadWidth, set at call sites where the constant offset is known, so that targets can make use of it in future patches.
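For illustration only (not part of this patch): a minimal sketch of how a target override could consume the new argument once call sites provide it. The MyTargetLowering name and the alignment heuristic below are hypothetical; the actual AVX subvector handling is left to the follow-up work referenced in #129695.

  bool MyTargetLowering::shouldReduceLoadWidth(
      SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
      std::optional<unsigned> ByteOffset) const {
    // No known constant offset means extra pointer math would be required,
    // so just defer to the generic profitability check.
    if (!ByteOffset)
      return TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
                                                       ByteOffset);
    // With a known offset, only narrow when the smaller load would start on
    // a boundary that is a multiple of its own store size (i.e. a clean
    // subvector extraction from the wider load).
    if (NewVT.isFixedLengthVector() &&
        (*ByteOffset % NewVT.getStoreSize().getFixedValue()) != 0)
      return false;
    return TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
                                                     ByteOffset);
  }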
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 ++++--
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++-----
.../CodeGen/SelectionDAG/TargetLowering.cpp | 20 +++++++++++--------
.../Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 ++--
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++----
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 5 ++---
llvm/lib/Target/BPF/BPFISelLowering.h | 5 +++--
.../Target/Hexagon/HexagonISelLowering.cpp | 12 ++++++-----
llvm/lib/Target/Hexagon/HexagonISelLowering.h | 4 ++--
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
llvm/lib/Target/X86/X86ISelLowering.h | 8 ++++++--
12 files changed, 56 insertions(+), 42 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 00c36266a069f..818058c5bf8e3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1824,8 +1824,10 @@ class TargetLoweringBase {
/// Return true if it is profitable to reduce a load to a smaller type.
/// Example: (i16 (trunc (i32 (load x))) -> i16 load x
- virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const {
+ /// Example: (i16 (trunc (srl (i32 (load x)), 16)) -> i16 load x+2
+ virtual bool shouldReduceLoadWidth(
+ SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset = std::nullopt) const {
// By default, assume that it is cheaper to extract a subvector from a wide
// vector load rather than creating multiple narrow vector loads.
if (NewVT.isVector() && !SDValue(Load, 0).hasOneUse())
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3f3f87d1f5658..f03a9617da003 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6690,7 +6690,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
!TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
return false;
- if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
+ if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT, /*ByteOffset=*/0))
return false;
return true;
@@ -6701,9 +6701,11 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
unsigned ShAmt) {
if (!LDST)
return false;
+
// Only allow byte offsets.
if (ShAmt % 8)
return false;
+ const unsigned ByteShAmt = ShAmt / 8;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
@@ -6727,8 +6729,6 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
// Ensure that this isn't going to produce an unsupported memory access.
if (ShAmt) {
- assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
- const unsigned ByteShAmt = ShAmt / 8;
const Align LDSTAlign = LDST->getAlign();
const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
@@ -6768,7 +6768,7 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
return false;
- if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
+ if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT, ByteShAmt))
return false;
} else {
assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
@@ -25268,7 +25268,8 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
+ if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT,
+ /*ByteOffset=*/Offset.getFixedValue()))
return SDValue();
// The narrow load will be offset from the base address of the old load if
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3995216e3d689..bd85529a7d076 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4726,8 +4726,6 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// for the narrowed load.
for (unsigned width = 8; width < origWidth; width *= 2) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), width);
- if (!shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT))
- continue;
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
// Avoid accessing any padding here for now (we could use memWidth
// instead of origWidth here otherwise).
@@ -4737,8 +4735,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
unsigned ptrOffset =
Layout.isLittleEndian() ? offset : memWidth - width - offset;
unsigned IsFast = 0;
+ assert((ptrOffset % 8) == 0 && "Non-Bytealigned pointer offset");
Align NewAlign = commonAlignment(Lod->getAlign(), ptrOffset / 8);
- if (allowsMemoryAccess(
+ if (shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT,
+ ptrOffset / 8) &&
+ allowsMemoryAccess(
*DAG.getContext(), Layout, newVT, Lod->getAddressSpace(),
NewAlign, Lod->getMemOperand()->getFlags(), &IsFast) &&
IsFast) {
@@ -12175,17 +12176,17 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
ISD::LoadExtType ExtTy =
ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
- if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
- !shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
+ std::optional<unsigned> ByteOffset;
Align Alignment = OriginalLoad->getAlign();
MachinePointerInfo MPI;
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
int Elt = ConstEltNo->getZExtValue();
- unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
- MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
- Alignment = commonAlignment(Alignment, PtrOff);
+ ByteOffset = VecEltVT.getSizeInBits() * Elt / 8;
+ MPI = OriginalLoad->getPointerInfo().getWithOffset(*ByteOffset);
+ Alignment = commonAlignment(Alignment, *ByteOffset);
} else {
// Discard the pointer info except the address space because the memory
// operand can't represent this new access since the offset is variable.
@@ -12193,6 +12194,9 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
}
+ if (!shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT, ByteOffset))
+ return SDValue();
+
unsigned IsFast = 0;
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
OriginalLoad->getAddressSpace(), Alignment,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 771eee1b3fecf..84d64bc835077 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16519,11 +16519,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
- ISD::LoadExtType ExtTy,
- EVT NewVT) const {
+bool AArch64TargetLowering::shouldReduceLoadWidth(
+ SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const {
// TODO: This may be worth removing. Check regression tests for diffs.
- if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+ if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
+ ByteOffset))
return false;
// If we're reducing the load width in order to avoid having to use an extra
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..8b5d2ec9e6ddf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -695,8 +695,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineFunction &MF,
unsigned Intrinsic) const override;
- bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const override;
bool shouldRemoveRedundantExtend(SDValue Op) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2846405a2538c..236c373e70250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -819,11 +819,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
-bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
- ISD::LoadExtType ExtTy,
- EVT NewVT) const {
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(
+ SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const {
// TODO: This may be worth removing. Check regression tests for diffs.
- if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+ if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
return false;
unsigned NewSize = NewVT.getStoreSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fa9d61ec37c24..a42214865ccfd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -215,9 +215,8 @@ class AMDGPUTargetLowering : public TargetLowering {
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
- bool shouldReduceLoadWidth(SDNode *Load,
- ISD::LoadExtType ExtType,
- EVT ExtVT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT,
+ std::optional<unsigned> ByteOffset) const override;
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
const MachineMemOperand &MMO) const final;
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index ad048ad05e6dd..8104895cb7f14 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -135,8 +135,9 @@ class BPFTargetLowering : public TargetLowering {
// ctx = ctx + reloc_offset
// ... (*(u8 *)(ctx + 1)) & 0x80 ...
// which will be rejected by the verifier.
- bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const override {
+ bool
+ shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const override {
return false;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 4c479ac41be12..fe12f99b91cd3 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3821,19 +3821,21 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
return TargetLowering::findRepresentativeClass(TRI, VT);
}
-bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
- ISD::LoadExtType ExtTy, EVT NewVT) const {
+bool HexagonTargetLowering::shouldReduceLoadWidth(
+ SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const {
// TODO: This may be worth removing. Check regression tests for diffs.
- if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+ if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
+ ByteOffset))
return false;
auto *L = cast<LoadSDNode>(Load);
- std::pair<SDValue,int> BO = getBaseAndOffset(L->getBasePtr());
+ std::pair<SDValue, int> BO = getBaseAndOffset(L->getBasePtr());
// Small-data object, do not shrink.
if (BO.first.getOpcode() == HexagonISD::CONST32_GP)
return false;
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(BO.first)) {
- auto &HTM = static_cast<const HexagonTargetMachine&>(getTargetMachine());
+ auto &HTM = static_cast<const HexagonTargetMachine &>(getTargetMachine());
const auto *GO = dyn_cast_or_null<const GlobalObject>(GA->getGlobal());
return !GO || !HTM.getObjFileLowering()->isGlobalInSmallSection(GO, HTM);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 4df88b3a8abd7..1321bee44a295 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -342,8 +342,8 @@ class HexagonTargetLowering : public TargetLowering {
SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
const override;
- bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const override;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 993118c52564e..f624eab61d5b8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3258,9 +3258,9 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
}
-bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
- ISD::LoadExtType ExtTy,
- EVT NewVT) const {
+bool X86TargetLowering::shouldReduceLoadWidth(
+ SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..74d46bb260399 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1346,6 +1346,9 @@ namespace llvm {
Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+ (Op.getOpcode() == ISD::LOAD &&
+ getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
TargetLowering::isTargetCanonicalConstantNode(Op);
}
@@ -1501,8 +1504,9 @@ namespace llvm {
/// Return true if we believe it is correct and profitable to reduce the
/// load node to a smaller type.
- bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const override;
+ bool
+ shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const override;
/// Return true if the specified scalar FP type is computed in an SSE
/// register, not on the X87 floating point stack.
From bddc9f8812c15521573abb892413151351ae0b9e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 22 Apr 2025 17:58:46 +0100
Subject: [PATCH 2/3] Add comment describing ByteOffset param
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 818058c5bf8e3..657d8637d6811 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1823,6 +1823,8 @@ class TargetLoweringBase {
virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
/// Return true if it is profitable to reduce a load to a smaller type.
+ /// \p ByteOffset is only set if we know the pointer offset at compile time
+ /// otherwise we should assume that additional pointer math is required.
/// Example: (i16 (trunc (i32 (load x))) -> i16 load x
/// Example: (i16 (trunc (srl (i32 (load x)), 16)) -> i16 load x+2
virtual bool shouldReduceLoadWidth(
From b814b2a6a199b35d1aed1cfcb6e943e86fe40e6a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 23 Apr 2025 09:52:22 +0100
Subject: [PATCH 3/3] Remove isTargetCanonicalConstantNode change mistakenly
brought in from #129695
---
llvm/lib/Target/X86/X86ISelLowering.h | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 74d46bb260399..7926292fc5bcf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1346,9 +1346,6 @@ namespace llvm {
Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
- Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
- (Op.getOpcode() == ISD::LOAD &&
- getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
TargetLowering::isTargetCanonicalConstantNode(Op);
}