[llvm] [DAGCombiner] Refactor and improve ReduceLoadOpStoreWidth (PR #119564)
Björn Pettersson via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 15 14:35:43 PST 2024
https://github.com/bjope updated https://github.com/llvm/llvm-project/pull/119564
>From 9e9812f5cc9d27fa74eac6ec91b97632e282ce55 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Fri, 6 Dec 2024 16:05:55 +0100
Subject: [PATCH 1/2] [DAGCombiner] Refactor and improve ReduceLoadOpStoreWidth
This patch make a couple of improvements to ReduceLoadOpStoreWidth.
When determining the minimum size of "NewBW" we now take byte
boundaries into account. If we for example touch bits 6-10 we
shouldn't accept NewBW=8, because we would fail later when detecting
that we can't access bits from two different bytes in memory using
a single load. Instead we make sure to align LSB/MSB according to
byte size boundaries up front before searching for a viable "NewBW".
In the past we only tried to find a "ShAmt" that was a multiple of
"NewBW", but now we use a sliding window technique to scan for a
viable "ShAmt" that is a multiple of the byte size. This can help
out finding more opportunities for optimization (specially if the
original type isn't byte sized, and for big-endian targets when
the original load/store is aligned on the most significant bit).
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 127 +++++++++---------
llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll | 16 +--
llvm/test/CodeGen/X86/apx/or.ll | 4 +-
.../CodeGen/X86/illegal-bitfield-loadstore.ll | 18 +--
llvm/test/CodeGen/X86/pr35763.ll | 10 +-
llvm/test/CodeGen/X86/store_op_load_fold.ll | 5 +-
6 files changed, 82 insertions(+), 98 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 907c4577d93d39..ab24715d1a7683 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20342,7 +20342,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
ST->getPointerInfo().getAddrSpace())
return SDValue();
- // Find the type to narrow it the load / op / store to.
+ // Find the type NewVT to narrow the load / op / store to.
SDValue N1 = Value.getOperand(1);
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = N1->getAsAPIntVal();
@@ -20350,9 +20350,13 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
Imm ^= APInt::getAllOnes(BitWidth);
if (Imm == 0 || Imm.isAllOnes())
return SDValue();
- unsigned ShAmt = Imm.countr_zero();
- unsigned MSB = BitWidth - Imm.countl_zero() - 1;
- unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+ // Find least/most significant bit that need to be part of the narrowed
+ // operation. We assume target will need to address/access full bytes, so
+ // we make sure to align LSB and MSB at byte boundaries.
+ unsigned BitsPerByteMask = 7u;
+ unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+ unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+ unsigned NewBW = NextPowerOf2(MSB - LSB);
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
@@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
if (NewBW >= BitWidth)
return SDValue();
- // TODO: For big-endian we probably want to align given the most significant
- // bit being modified instead of adjusting ShAmt based on least significant
- // bits. This to reduce the risk of failing on the alignment check below. If
- // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
- // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
- // ShAmt should be set to 8, which isn't a multiple of NewBW. But given
- // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
- // big-endian target, then the support for big-endian here isn't covered by
- // any in-tree lit tests, so it is unfortunately not highly optimized
- // either. It should be possible to improve that by using
- // ReduceLoadOpStoreWidthForceNarrowingProfitable.
-
- // If the lsb that is modified does not start at the type bitwidth boundary,
- // align to start at the previous boundary.
- ShAmt = ShAmt - (ShAmt % NewBW);
-
- // Make sure we do not access memory outside the memory touched by the
- // original load/store.
- if (ShAmt + NewBW > VT.getStoreSizeInBits())
- return SDValue();
+ // If we come this far NewVT/NewBW reflect a power-of-2 sized type that is
+ // large enough to cover all bits that should be modified. This type might
+ // however be larger than really needed (such as i32 while we actually only
+ // need to modify one byte). Now we need to find our how to align the memory
+ // accesses to satisfy preferred alignments as well as avoiding to access
+ // memory outside the store size of the orignal access.
+
+ unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+ // Let ShAmt denote amount of bits to skip, counted from the least
+ // significant bits of Imm. And let PtrOff how much the pointer needs to be
+ // offsetted (in bytes) for the new access.
+ unsigned ShAmt = 0;
+ uint64_t PtrOff = 0;
+ for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+ // Make sure the range [ShAmt, ShAmt+NewBW) cover both LSB and MSB.
+ if (ShAmt > LSB)
+ return SDValue();
+ if (ShAmt + NewBW < MSB)
+ continue;
- APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
- std::min(BitWidth, ShAmt + NewBW));
- if ((Imm & Mask) == Imm) {
- APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
- if (Opc == ISD::AND)
- NewImm ^= APInt::getAllOnes(NewBW);
- uint64_t PtrOff = ShAmt / 8;
- // For big endian targets, we need to adjust the offset to the pointer to
- // load the correct bytes.
- if (DAG.getDataLayout().isBigEndian())
- PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+ // Calculate PtrOff.
+ unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+ ? VTStoreSize - NewBW - ShAmt
+ : ShAmt;
+ PtrOff = PtrAdjustmentInBits / 8;
+ // Now check if narrow access is allowed and fast, considering alignments.
unsigned IsFast = 0;
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
- LD->getAddressSpace(), NewAlign,
- LD->getMemOperand()->getFlags(), &IsFast) ||
- !IsFast)
- return SDValue();
-
- SDValue NewPtr =
- DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
- SDValue NewLD =
- DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
- LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
- LD->getMemOperand()->getFlags(), LD->getAAInfo());
- SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
- DAG.getConstant(NewImm, SDLoc(Value),
- NewVT));
- SDValue NewST =
- DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
- ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
- AddToWorklist(NewPtr.getNode());
- AddToWorklist(NewLD.getNode());
- AddToWorklist(NewVal.getNode());
- WorklistRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
- ++OpsNarrowed;
- return NewST;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+ LD->getAddressSpace(), NewAlign,
+ LD->getMemOperand()->getFlags(), &IsFast) &&
+ IsFast)
+ break;
}
+ // If loop above did not find any accepted ShAmt we need to exit here.
+ if (ShAmt + NewBW > VTStoreSize)
+ return SDValue();
+
+ APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
+ if (Opc == ISD::AND)
+ NewImm ^= APInt::getAllOnes(NewBW);
+ Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
+ SDValue NewPtr =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
+ SDValue NewLD =
+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+ DAG.getConstant(NewImm, SDLoc(Value), NewVT));
+ SDValue NewST =
+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+ AddToWorklist(NewPtr.getNode());
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewVal.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+ ++OpsNarrowed;
+ return NewST;
}
return SDValue();
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
index efdfa10f7ca07f..6819af3ba3e26c 100644
--- a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
+++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -32,10 +32,10 @@ define i16 @test(ptr %p1) {
;
; CHECK-LE-NARROW-LABEL: test:
; CHECK-LE-NARROW: @ %bb.0: @ %entry
-; CHECK-LE-NARROW-NEXT: ldrh r1, [r0, #8]
-; CHECK-LE-NARROW-NEXT: movw r2, #65534
-; CHECK-LE-NARROW-NEXT: orr r1, r1, r2
-; CHECK-LE-NARROW-NEXT: strh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6]
+; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144
+; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216
+; CHECK-LE-NARROW-NEXT: str r1, [r0, #6]
; CHECK-LE-NARROW-NEXT: mov r0, #0
; CHECK-LE-NARROW-NEXT: bx lr
;
@@ -50,10 +50,10 @@ define i16 @test(ptr %p1) {
;
; CHECK-BE-NARROW-LABEL: test:
; CHECK-BE-NARROW: @ %bb.0: @ %entry
-; CHECK-BE-NARROW-NEXT: ldrh r1, [r0]
-; CHECK-BE-NARROW-NEXT: movw r2, #65534
-; CHECK-BE-NARROW-NEXT: orr r1, r1, r2
-; CHECK-BE-NARROW-NEXT: strh r1, [r0]
+; CHECK-BE-NARROW-NEXT: ldr r1, [r0]
+; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144
+; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216
+; CHECK-BE-NARROW-NEXT: str r1, [r0]
; CHECK-BE-NARROW-NEXT: mov r0, #0
; CHECK-BE-NARROW-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index e51ba9d9bf0391..995946c007dca4 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -906,13 +906,13 @@ entry:
define void @or64mi_legacy(ptr %a) {
; CHECK-LABEL: or64mi_legacy:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; CHECK-NEXT: # imm = 0x1E240
; CHECK-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mi_legacy:
; NF: # %bb.0: # %entry
-; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 7fb07c6b3163e7..338163d0e1cffa 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -6,22 +6,12 @@ define void @i24_or(ptr %a) {
; X86-LABEL: i24_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %ecx
-; X86-NEXT: movzbl 2(%eax), %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl $384, %edx # imm = 0x180
-; X86-NEXT: movw %dx, (%eax)
+; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i24_or:
; X64: # %bb.0:
-; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzbl 2(%rdi), %ecx
-; X64-NEXT: shll $16, %ecx
-; X64-NEXT: orl %eax, %ecx
-; X64-NEXT: orl $384, %ecx # imm = 0x180
-; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i24, ptr %a, align 1
%b = or i24 %aa, 384
@@ -103,12 +93,12 @@ define void @i56_or(ptr %a) {
; X86-LABEL: i56_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl $384, (%eax) # imm = 0x180
+; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i56_or:
; X64: # %bb.0:
-; X64-NEXT: orl $384, (%rdi) # imm = 0x180
+; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i56, ptr %a, align 1
%b = or i56 %aa, 384
diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll
index 9d2ee84cf675bd..c10a24cffe5f2d 100644
--- a/llvm/test/CodeGen/X86/pr35763.ll
+++ b/llvm/test/CodeGen/X86/pr35763.ll
@@ -14,15 +14,7 @@ define dso_local void @PR35763() {
; CHECK-NEXT: movzwl z+2(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip)
-; CHECK-NEXT: movl z+6(%rip), %eax
-; CHECK-NEXT: movzbl z+10(%rip), %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
-; CHECK-NEXT: andq %rcx, %rax
-; CHECK-NEXT: movl %eax, z+6(%rip)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movb %al, z+10(%rip)
+; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF
; CHECK-NEXT: retq
entry:
%0 = load i16, ptr @z, align 8
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
index 2915d1a7d7ae12..0309fa7bb03100 100644
--- a/llvm/test/CodeGen/X86/store_op_load_fold.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,10 +23,7 @@ define void @test2() nounwind uwtable ssp {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT: movzbl 22(%eax), %ecx
-; CHECK-NEXT: andl $-4, %ecx
-; CHECK-NEXT: movb %cl, 22(%eax)
-; CHECK-NEXT: movw $0, 20(%eax)
+; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF
; CHECK-NEXT: retl
%bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
%bf.clear36 = and i56 %bf.load35, -1125895611875329
>From 9bf516751aed78c86a05b6c0a7ca1d693a240b33 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Sun, 15 Dec 2024 23:32:22 +0100
Subject: [PATCH 2/2] Fixup. Use flipAllBits.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ab24715d1a7683..9d888d9940a1cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20347,7 +20347,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = N1->getAsAPIntVal();
if (Opc == ISD::AND)
- Imm ^= APInt::getAllOnes(BitWidth);
+ Imm.flipAllBits();
if (Imm == 0 || Imm.isAllOnes())
return SDValue();
// Find least/most significant bit that need to be part of the narrowed
@@ -20413,7 +20413,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
if (Opc == ISD::AND)
- NewImm ^= APInt::getAllOnes(NewBW);
+ NewImm.flipAllBits();
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
SDValue NewPtr =
DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
More information about the llvm-commits
mailing list