[llvm] fold mov dec/inc to lea +- 1 (PR #185194)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 7 15:12:27 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Takashi Idobe (Takashiidobe)
<details>
<summary>Changes</summary>
Resolves: https://github.com/llvm/llvm-project/issues/182784
This PR folds this pattern:
```asm
mov eax, ecx
dec eax
```
to:
```asm
lea eax, [rcx - 1]
```
And, analogously, this inc pattern:
```asm
mov eax, ecx
inc eax
```
to:
```asm
lea eax, [rcx + 1]
```
To do so, it looks for an inc/dec instruction whose EFLAGS result is dead (since lea doesn't set flags) and whose source and destination are the same register (which should be implicit for inc/dec), then checks that the previous non-debug instruction is a register move of the form `mov dst, src` into that same destination. If so, it builds a lea instruction with the relevant operands, transfers the debug info, deletes the mov and the dec/inc, and continues on to the next instruction.
---
Full diff: https://github.com/llvm/llvm-project/pull/185194.diff
5 Files Affected:
- (modified) llvm/lib/Target/X86/X86FixupLEAs.cpp (+90)
- (modified) llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/fold-loop-of-urem.ll (+2-4)
- (added) llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll (+100)
- (modified) llvm/test/CodeGen/X86/pr44412.ll (+2-4)
``````````diff
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 07f656fc5ccfd..2837968b819cc 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
@@ -120,6 +121,13 @@ class FixupLEAsImpl {
MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) const;
+ /// Fold adjacent mov/inc-dec into a single LEA:
+ /// mov dst, src
+ /// dec/inc dst (flags dead)
+ /// =>
+ /// lea dst, [src +/- 1]
+ bool foldMovIncDecToLEA(MachineBasicBlock &MBB, const X86Subtarget &ST) const;
+
public:
FixupLEAsImpl(ProfileSummaryInfo *PSI, MachineBlockFrequencyInfo *MBFI)
: PSI(PSI), MBFI(MBFI) {}
@@ -229,6 +237,16 @@ static bool isLEA(unsigned Opcode) {
Opcode == X86::LEA64_32r;
}
+static MachineBasicBlock::iterator
+getPrevNonDebugInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+ while (I != MBB.begin()) {
+ --I;
+ if (!I->isDebugInstr())
+ return I;
+ }
+ return MBB.end();
+}
+
bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
bool IsSlowLEA = ST.slowLEA();
@@ -244,6 +262,8 @@ bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
for (MachineBasicBlock &MBB : MF) {
+ foldMovIncDecToLEA(MBB, ST);
+
// First pass. Try to remove or optimize existing LEAs.
bool OptIncDecPerBB =
OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
@@ -273,6 +293,76 @@ bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
return true;
}
+bool FixupLEAsImpl::foldMovIncDecToLEA(MachineBasicBlock &MBB,
+ const X86Subtarget &ST) const {
+ bool Changed = false;
+
+ for (auto I = MBB.begin(); I != MBB.end();) {
+ MachineInstr &MI = *I;
+ unsigned Opc = MI.getOpcode();
+
+ bool IsDec = Opc == X86::DEC32r || Opc == X86::DEC64r;
+ bool IsInc = Opc == X86::INC32r || Opc == X86::INC64r;
+ bool Is64BitIncDec = Opc == X86::DEC64r || Opc == X86::INC64r;
+ if (!IsDec && !IsInc) {
+ ++I;
+ continue;
+ }
+
+ if (!MI.registerDefIsDead(X86::EFLAGS, TRI)) {
+ ++I;
+ continue;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!DstReg.isPhysical() || MI.getOperand(1).getReg() != DstReg) {
+ ++I;
+ continue;
+ }
+
+ auto Prev = getPrevNonDebugInstr(MBB, I);
+ if (Prev == MBB.end()) {
+ ++I;
+ continue;
+ }
+
+ unsigned MovOpc = Is64BitIncDec ? X86::MOV64rr : X86::MOV32rr;
+ if (Prev->getOpcode() != MovOpc || Prev->getOperand(0).getReg() != DstReg) {
+ ++I;
+ continue;
+ }
+
+ Register SrcReg = Prev->getOperand(1).getReg();
+ Register LEASrcReg = SrcReg;
+ unsigned LEAOpc = X86::LEA32r;
+ if (Is64BitIncDec) {
+ LEAOpc = X86::LEA64r;
+ } else if (ST.is64Bit()) {
+ LEAOpc = X86::LEA64_32r;
+ LEASrcReg = getX86SubSuperRegister(SrcReg, 64);
+ }
+
+ MachineInstr *NewMI =
+ BuildMI(MBB, I, MI.getDebugLoc(), TII->get(LEAOpc), DstReg)
+ .addReg(LEASrcReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(IsDec ? -1 : 1)
+ .addReg(0);
+
+ ++NumLEAs;
+ Changed = true;
+ MBB.getParent()->substituteDebugValuesForInst(MI, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*Prev, *NewMI, 1);
+
+ auto EraseIncDec = I++;
+ MBB.erase(EraseIncDec);
+ MBB.erase(Prev);
+ }
+
+ return Changed;
+}
+
FixupLEAsImpl::RegUsageState
FixupLEAsImpl::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 8a8e7a3b4df2c..abff79a5b232d 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -238,8 +238,7 @@ define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %edx
-; CHECK-NEXT: movl %esi, %r8d
-; CHECK-NEXT: decl %r8d
+; CHECK-NEXT: leal -1(%rsi), %r8d
; CHECK-NEXT: jmp .LBB4_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_3: # %if.false
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index cb1c078ee5129..37cb39dac265e 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -389,8 +389,7 @@ define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: testb $1, %r14b
; CHECK-NEXT: je .LBB5_7
; CHECK-NEXT: # %bb.4: # in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: movl %eax, %ebp
-; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: leal 1(%rax), %ebp
; CHECK-NEXT: jmp .LBB5_6
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_5: # %for.body2
@@ -901,8 +900,7 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: decl %ebp
+; CHECK-NEXT: leal -1(%rdi), %ebp
; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: xorl %r13d, %r13d
diff --git a/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll b/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll
new file mode 100644
index 0000000000000..4690fbd0cb5c3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+
+define i64 @mov_dec(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_dec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %ecx
+; CHECK-NEXT: leal -1(%rcx), %eax
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %cmp = icmp slt <32 x i8> %x, zeroinitializer
+ %mvmsk = bitcast <32 x i1> %cmp to i32
+ %dec = add i32 %mvmsk, -1
+ %ext = zext i32 %mvmsk to i64
+ %shl = shl nuw i64 %ext, 32
+ %dec.ext = zext i32 %dec to i64
+ %res = or disjoint i64 %shl, %dec.ext
+ ret i64 %res
+}
+
+define i64 @mov_inc(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %ecx
+; CHECK-NEXT: leal 1(%rcx), %eax
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %cmp = icmp slt <32 x i8> %x, zeroinitializer
+ %mvmsk = bitcast <32 x i1> %cmp to i32
+ %inc = add i32 %mvmsk, 1
+ %ext = zext i32 %mvmsk to i64
+ %shl = shl nuw i64 %ext, 32
+ %inc.ext = zext i32 %inc to i64
+ %res = or disjoint i64 %shl, %inc.ext
+ ret i64 %res
+}
+
+define i64 @mov_inc_flags_live(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc_flags_live:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: incl %eax
+; CHECK-NEXT: cmovneq %rcx, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %cmp = icmp slt <32 x i8> %x, zeroinitializer
+ %mvmsk = bitcast <32 x i1> %cmp to i32
+ %inc = add i32 %mvmsk, 1
+ %iszero = icmp eq i32 %inc, 0
+ %ext = zext i32 %mvmsk to i64
+ %inc.ext = zext i32 %inc to i64
+ %sel = select i1 %iszero, i64 %inc.ext, i64 %ext
+ ret i64 %sel
+}
+
+define i64 @mov_dec_flags_live(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_dec_flags_live:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: decl %eax
+; CHECK-NEXT: cmovneq %rcx, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %cmp = icmp slt <32 x i8> %x, zeroinitializer
+ %mvmsk = bitcast <32 x i1> %cmp to i32
+ %dec = add i32 %mvmsk, -1
+ %iszero = icmp eq i32 %dec, 0
+ %ext = zext i32 %mvmsk to i64
+ %dec.ext = zext i32 %dec to i64
+ %sel = select i1 %iszero, i64 %dec.ext, i64 %ext
+ ret i64 %sel
+}
+
+define i64 @mov_inc_not_adjacent(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc_not_adjacent:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %eax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: incl %eax
+; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: xorq $5, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %cmp = icmp slt <32 x i8> %x, zeroinitializer
+ %mvmsk = bitcast <32 x i1> %cmp to i32
+ %ext = zext i32 %mvmsk to i64
+ %shl = shl nuw i64 %ext, 32
+ %tmp = xor i64 %shl, 5
+ %inc = add i32 %mvmsk, 1
+ %inc.ext = zext i32 %inc to i64
+ %res = xor i64 %tmp, %inc.ext
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr44412.ll b/llvm/test/CodeGen/X86/pr44412.ll
index 546dbcc156129..331d9f0e7d45e 100644
--- a/llvm/test/CodeGen/X86/pr44412.ll
+++ b/llvm/test/CodeGen/X86/pr44412.ll
@@ -8,8 +8,7 @@ define void @bar(i32 %0, i32 %1) nounwind {
; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: # %bb.1: # %.preheader
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movl %edi, %ebx
-; CHECK-NEXT: decl %ebx
+; CHECK-NEXT: leal -1(%rdi), %ebx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl %ebx, %edi
@@ -41,8 +40,7 @@ define void @baz(i32 %0, i32 %1) nounwind {
; CHECK-NEXT: je .LBB1_4
; CHECK-NEXT: # %bb.1: # %.preheader
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movl %edi, %ebx
-; CHECK-NEXT: decl %ebx
+; CHECK-NEXT: leal -1(%rdi), %ebx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl %ebx, %edi
``````````
</details>
https://github.com/llvm/llvm-project/pull/185194
More information about the llvm-commits
mailing list