[llvm] Patch series related to DAGCombiner::ReduceLoadOpStoreWidth (PR #119140)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 8 10:35:34 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-arm
Author: Björn Pettersson (bjope)
These patches are included (to be submitted in top-to-bottom order):
- [DAGCombiner] Simplify ShAmt alignment in ReduceLoadOpStoreWidth. NFC
- [DAGCombiner] Add option to force isNarrowingProfitable in tests. NFC
- [DAGCombiner] Pre-commit test case for ReduceLoadOpStoreWidth. NFC
- [DAGCombiner] Fix to avoid writing outside original store in ReduceLoadOpStoreWidth
- [DAGCombiner] Refactor and improve ReduceLoadOpStoreWidth
Main goals are to avoid accesses outside the original load/store and to avoid assertion failures for big-endian targets. A secondary goal is to improve the implementation so that the optimization kicks in for situations that have been overlooked in the past.
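As a rough illustration (hypothetical C++, not taken from the patch), this is the kind of source pattern that lowers into the load/op/store DAG sequence the combine rewrites:

```cpp
// Illustrative sketch only; the function name is hypothetical.
#include <cstdint>

void set_low_flags(uint32_t *p) {
  // A 32-bit load, an OR with a constant that only touches the low 16 bits
  // (0x180 is the same constant used in illegal-bitfield-loadstore.ll), and
  // a 32-bit store back. ReduceLoadOpStoreWidth may shrink this to a 16-bit
  // load/or/store when the target considers it profitable, as long as the
  // narrow access stays within the bytes covered by the original store.
  *p |= 0x180u;
}
```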
---
Full diff: https://github.com/llvm/llvm-project/pull/119140.diff
6 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+77-48)
- (added) llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll (+66)
- (modified) llvm/test/CodeGen/X86/apx/or.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll (+4-14)
- (modified) llvm/test/CodeGen/X86/pr35763.ll (+1-9)
- (modified) llvm/test/CodeGen/X86/store_op_load_fold.ll (+1-1)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1aab0fce4e1e53..f14f6903232d13 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
"combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
cl::desc("DAG combiner enable reducing the width of load/op/store "
"sequence"));
+static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
+ "combiner-reduce-load-op-store-width-force-narrowing-profitable",
+ cl::Hidden, cl::init(false),
+ cl::desc("DAG combiner force override the narrowing profitable check when"
+ "reducing the width of load/op/store sequences"));
static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
"combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
@@ -20334,7 +20339,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
ST->getPointerInfo().getAddrSpace())
return SDValue();
- // Find the type to narrow it the load / op / store to.
+ // Find the type NewVT to narrow the load / op / store to.
SDValue N1 = Value.getOperand(1);
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = N1->getAsAPIntVal();
@@ -20342,66 +20347,90 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
Imm ^= APInt::getAllOnes(BitWidth);
if (Imm == 0 || Imm.isAllOnes())
return SDValue();
- unsigned ShAmt = Imm.countr_zero();
- unsigned MSB = BitWidth - Imm.countl_zero() - 1;
- unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+ // Find the least/most significant bits that need to be part of the narrowed
+ // operation. We assume the target will need to address/access full bytes, so
+ // we make sure to align LSB and MSB to byte boundaries.
+ unsigned BitsPerByteMask = 7u;
+ unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+ unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+ unsigned NewBW = NextPowerOf2(MSB - LSB);
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
- while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
- !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
- !TLI.isNarrowingProfitable(N, VT, NewVT))) {
+ while (NewBW < BitWidth &&
+ (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !(TLI.isNarrowingProfitable(N, VT, NewVT) ||
+ ReduceLoadOpStoreWidthForceNarrowingProfitable))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
if (NewBW >= BitWidth)
return SDValue();
- // If the lsb changed does not start at the type bitwidth boundary,
- // start at the previous one.
- if (ShAmt % NewBW)
- ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
- APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
- std::min(BitWidth, ShAmt + NewBW));
- if ((Imm & Mask) == Imm) {
- APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
- if (Opc == ISD::AND)
- NewImm ^= APInt::getAllOnes(NewBW);
- uint64_t PtrOff = ShAmt / 8;
- // For big endian targets, we need to adjust the offset to the pointer to
- // load the correct bytes.
- if (DAG.getDataLayout().isBigEndian())
- PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+ // If we get this far, NewVT/NewBW reflect a power-of-2 sized type that is
+ // large enough to cover all bits that need to be modified. This type might
+ // however be larger than really needed (such as i32 when we actually only
+ // need to modify one byte). Now we need to find out how to align the memory
+ // accesses to satisfy preferred alignments as well as avoid accessing
+ // memory outside the store size of the original access.
+
+ unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+ // Let ShAmt denote the number of bits to skip, counted from the least
+ // significant bits of Imm, and let PtrOff denote how many bytes the pointer
+ // needs to be offset for the new access.
+ unsigned ShAmt = 0;
+ uint64_t PtrOff = 0;
+ for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+ // Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
+ if (ShAmt > LSB)
+ return SDValue();
+ if (ShAmt + NewBW < MSB)
+ continue;
+
+ // Calculate PtrOff.
+ unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+ ? VTStoreSize - NewBW - ShAmt
+ : ShAmt;
+ PtrOff = PtrAdjustmentInBits / 8;
+ // Now check if narrow access is allowed and fast, considering alignments.
unsigned IsFast = 0;
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
- LD->getAddressSpace(), NewAlign,
- LD->getMemOperand()->getFlags(), &IsFast) ||
- !IsFast)
- return SDValue();
-
- SDValue NewPtr =
- DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
- SDValue NewLD =
- DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
- LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
- LD->getMemOperand()->getFlags(), LD->getAAInfo());
- SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
- DAG.getConstant(NewImm, SDLoc(Value),
- NewVT));
- SDValue NewST =
- DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
- ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
- AddToWorklist(NewPtr.getNode());
- AddToWorklist(NewLD.getNode());
- AddToWorklist(NewVal.getNode());
- WorklistRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
- ++OpsNarrowed;
- return NewST;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+ LD->getAddressSpace(), NewAlign,
+ LD->getMemOperand()->getFlags(), &IsFast) &&
+ IsFast)
+ break;
}
+ // If the loop above did not find an accepted ShAmt, we need to bail out here.
+ if (ShAmt + NewBW > VTStoreSize)
+ return SDValue();
+
+ APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
+ if (Opc == ISD::AND)
+ NewImm ^= APInt::getAllOnes(NewBW);
+ Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
+ SDValue NewPtr =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
+ SDValue NewLD =
+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+ DAG.getConstant(NewImm, SDLoc(Value), NewVT));
+ SDValue NewST =
+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+ AddToWorklist(NewPtr.getNode());
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewVal.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+ ++OpsNarrowed;
+ return NewST;
}
return SDValue();
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
new file mode 100644
index 00000000000000..6819af3ba3e26c
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL
+; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW
+; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL
+; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW
+
+; This is a reproducer for a bug where DAGCombiner::ReduceLoadOpStoreWidth
+; would end up narrowing the load-op-store sequence into this SDNode sequence
+; for little-endian:
+;
+; t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32
+; t20: i32 = or t18, Constant:i32<65534>
+; t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32
+;
+; This was wrong since it accesses memory above %ir.p1+9, which is outside
+; the "store size" of the original store.
+;
+; For big-endian we used to hit an assertion due to passing a negative offset
+; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while
+; before that commit we got load/store instructions that accessed memory at a
+; negative offset from %p1).
+;
+define i16 @test(ptr %p1) {
+; CHECK-LE-NORMAL-LABEL: test:
+; CHECK-LE-NORMAL: @ %bb.0: @ %entry
+; CHECK-LE-NORMAL-NEXT: ldrh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT: movw r2, #65534
+; CHECK-LE-NORMAL-NEXT: orr r1, r1, r2
+; CHECK-LE-NORMAL-NEXT: strh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT: mov r0, #0
+; CHECK-LE-NORMAL-NEXT: bx lr
+;
+; CHECK-LE-NARROW-LABEL: test:
+; CHECK-LE-NARROW: @ %bb.0: @ %entry
+; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6]
+; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144
+; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216
+; CHECK-LE-NARROW-NEXT: str r1, [r0, #6]
+; CHECK-LE-NARROW-NEXT: mov r0, #0
+; CHECK-LE-NARROW-NEXT: bx lr
+;
+; CHECK-BE-NORMAL-LABEL: test:
+; CHECK-BE-NORMAL: @ %bb.0: @ %entry
+; CHECK-BE-NORMAL-NEXT: ldrh r1, [r0]
+; CHECK-BE-NORMAL-NEXT: movw r2, #65534
+; CHECK-BE-NORMAL-NEXT: orr r1, r1, r2
+; CHECK-BE-NORMAL-NEXT: strh r1, [r0]
+; CHECK-BE-NORMAL-NEXT: mov r0, #0
+; CHECK-BE-NORMAL-NEXT: bx lr
+;
+; CHECK-BE-NARROW-LABEL: test:
+; CHECK-BE-NARROW: @ %bb.0: @ %entry
+; CHECK-BE-NARROW-NEXT: ldr r1, [r0]
+; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144
+; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216
+; CHECK-BE-NARROW-NEXT: str r1, [r0]
+; CHECK-BE-NARROW-NEXT: mov r0, #0
+; CHECK-BE-NARROW-NEXT: bx lr
+entry:
+ %load = load i80, ptr %p1, align 32
+ %mask = shl i80 -1, 65
+ %op = or i80 %load, %mask
+ store i80 %op, ptr %p1, align 32
+ ret i16 0
+}
+
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index e51ba9d9bf0391..995946c007dca4 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -906,13 +906,13 @@ entry:
define void @or64mi_legacy(ptr %a) {
; CHECK-LABEL: or64mi_legacy:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; CHECK-NEXT: # imm = 0x1E240
; CHECK-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mi_legacy:
; NF: # %bb.0: # %entry
-; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 7fb07c6b3163e7..338163d0e1cffa 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -6,22 +6,12 @@ define void @i24_or(ptr %a) {
; X86-LABEL: i24_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %ecx
-; X86-NEXT: movzbl 2(%eax), %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl $384, %edx # imm = 0x180
-; X86-NEXT: movw %dx, (%eax)
+; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i24_or:
; X64: # %bb.0:
-; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzbl 2(%rdi), %ecx
-; X64-NEXT: shll $16, %ecx
-; X64-NEXT: orl %eax, %ecx
-; X64-NEXT: orl $384, %ecx # imm = 0x180
-; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i24, ptr %a, align 1
%b = or i24 %aa, 384
@@ -103,12 +93,12 @@ define void @i56_or(ptr %a) {
; X86-LABEL: i56_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl $384, (%eax) # imm = 0x180
+; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i56_or:
; X64: # %bb.0:
-; X64-NEXT: orl $384, (%rdi) # imm = 0x180
+; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i56, ptr %a, align 1
%b = or i56 %aa, 384
diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll
index 9d2ee84cf675bd..c10a24cffe5f2d 100644
--- a/llvm/test/CodeGen/X86/pr35763.ll
+++ b/llvm/test/CodeGen/X86/pr35763.ll
@@ -14,15 +14,7 @@ define dso_local void @PR35763() {
; CHECK-NEXT: movzwl z+2(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip)
-; CHECK-NEXT: movl z+6(%rip), %eax
-; CHECK-NEXT: movzbl z+10(%rip), %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
-; CHECK-NEXT: andq %rcx, %rax
-; CHECK-NEXT: movl %eax, z+6(%rip)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movb %al, z+10(%rip)
+; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF
; CHECK-NEXT: retq
entry:
%0 = load i16, ptr @z, align 8
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
index bd7068535eabcc..0309fa7bb03100 100644
--- a/llvm/test/CodeGen/X86/store_op_load_fold.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,7 +23,7 @@ define void @test2() nounwind uwtable ssp {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT: andl $-262144, 20(%eax) ## imm = 0xFFFC0000
+; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF
; CHECK-NEXT: retl
%bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
%bf.clear36 = and i56 %bf.load35, -1125895611875329
``````````
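To make the new byte-aligned narrowing arithmetic concrete, here is a small standalone sketch (simplified, not the actual DAGCombiner code) that replays the LSB/MSB and ShAmt computation for the i80 test case above. It assumes the while loop in the patch settles on an i32 access for this ARM case, which is what the CHECK-*-NARROW lines suggest:

```cpp
// Simplified replay of the patch's narrowing arithmetic for the i80 test
// case above (Imm has bits [65,79] set). APInt queries and the TLI
// legality/profitability checks are replaced by hard-coded values.
#include <cassert>

int main() {
  const unsigned VTStoreSize = 80; // store size (in bits) of the i80 access
  const unsigned CountrZero = 65;  // stands in for Imm.countr_zero()
  const unsigned ActiveBits = 80;  // stands in for Imm.getActiveBits()

  // Byte-align the bit range that has to be modified.
  const unsigned BitsPerByteMask = 7u;
  unsigned LSB = CountrZero & ~BitsPerByteMask;      // 64
  unsigned MSB = (ActiveBits - 1) | BitsPerByteMask; // 79

  // NextPowerOf2(MSB - LSB) would give 16, but we assume the legality loop
  // bumps NewBW to 32 here, matching the 32-bit accesses in the
  // CHECK-*-NARROW output.
  unsigned NewBW = 32;

  // Walk byte offsets until [ShAmt, ShAmt + NewBW) covers both LSB and MSB
  // without reaching past VTStoreSize; this is what keeps the narrowed
  // access inside the bytes covered by the original store.
  unsigned ShAmt = 0;
  for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
    if (ShAmt > LSB)
      break; // no acceptable position exists
    if (ShAmt + NewBW < MSB)
      continue; // window does not reach the MSB yet
    break; // accepted
  }
  assert(ShAmt + NewBW <= VTStoreSize && ShAmt <= LSB && "narrowing failed");
  assert(ShAmt == 48);

  // Pointer adjustment in bytes: 6 for little-endian and 0 for big-endian,
  // matching the ldr/str offsets in the CHECK-LE-NARROW and CHECK-BE-NARROW
  // lines above.
  unsigned PtrOffLE = ShAmt / 8;                         // 6
  unsigned PtrOffBE = (VTStoreSize - NewBW - ShAmt) / 8; // 0
  assert(PtrOffLE == 6 && PtrOffBE == 0);
  return 0;
}
```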
https://github.com/llvm/llvm-project/pull/119140