[llvm] fold mov dec/inc to lea +- 1 (PR #185194)

via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 7 15:12:27 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: Takashi Idobe (Takashiidobe)

<details>
<summary>Changes</summary>

Resolves: https://github.com/llvm/llvm-project/issues/182784

This PR folds this pattern:

```asm
mov     eax, ecx
dec     eax
``` 

to:
```asm
lea     eax, [rcx - 1]
```

And the same inc pattern:

```asm
mov     eax, ecx
inc     eax
``` 

to:
```asm
lea     eax, [rcx + 1]
```

To do so it checks for an inc/dec instruction whose EFLAGS def is dead (since lea doesn't set flags) and whose source and destination are the same register (which is implicit for inc/dec), and then checks that the preceding non-debug instruction is of the form `mov dst, src`. If that pattern matches, it builds a lea instruction with the relevant operands, transfers the debug info to it, deletes the mov and the dec/inc, and then continues on to the next instruction.

---
Full diff: https://github.com/llvm/llvm-project/pull/185194.diff


5 Files Affected:

- (modified) llvm/lib/Target/X86/X86FixupLEAs.cpp (+90) 
- (modified) llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll (+1-2) 
- (modified) llvm/test/CodeGen/X86/fold-loop-of-urem.ll (+2-4) 
- (added) llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll (+100) 
- (modified) llvm/test/CodeGen/X86/pr44412.ll (+2-4) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 07f656fc5ccfd..2837968b819cc 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
@@ -120,6 +121,13 @@ class FixupLEAsImpl {
   MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator &MBBI) const;
 
+  /// Fold adjacent mov/inc-dec into a single LEA:
+  ///   mov dst, src
+  ///   dec/inc dst   (flags dead)
+  /// =>
+  ///   lea dst, [src +/- 1]
+  bool foldMovIncDecToLEA(MachineBasicBlock &MBB, const X86Subtarget &ST) const;
+
 public:
   FixupLEAsImpl(ProfileSummaryInfo *PSI, MachineBlockFrequencyInfo *MBFI)
       : PSI(PSI), MBFI(MBFI) {}
@@ -229,6 +237,16 @@ static bool isLEA(unsigned Opcode) {
          Opcode == X86::LEA64_32r;
 }
 
+static MachineBasicBlock::iterator
+getPrevNonDebugInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+  while (I != MBB.begin()) {
+    --I;
+    if (!I->isDebugInstr())
+      return I;
+  }
+  return MBB.end();
+}
+
 bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   bool IsSlowLEA = ST.slowLEA();
@@ -244,6 +262,8 @@ bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
 
   LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   for (MachineBasicBlock &MBB : MF) {
+    foldMovIncDecToLEA(MBB, ST);
+
     // First pass. Try to remove or optimize existing LEAs.
     bool OptIncDecPerBB =
         OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
@@ -273,6 +293,76 @@ bool FixupLEAsImpl::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
+bool FixupLEAsImpl::foldMovIncDecToLEA(MachineBasicBlock &MBB,
+                                       const X86Subtarget &ST) const {
+  bool Changed = false;
+
+  for (auto I = MBB.begin(); I != MBB.end();) {
+    MachineInstr &MI = *I;
+    unsigned Opc = MI.getOpcode();
+
+    bool IsDec = Opc == X86::DEC32r || Opc == X86::DEC64r;
+    bool IsInc = Opc == X86::INC32r || Opc == X86::INC64r;
+    bool Is64BitIncDec = Opc == X86::DEC64r || Opc == X86::INC64r;
+    if (!IsDec && !IsInc) {
+      ++I;
+      continue;
+    }
+
+    if (!MI.registerDefIsDead(X86::EFLAGS, TRI)) {
+      ++I;
+      continue;
+    }
+
+    Register DstReg = MI.getOperand(0).getReg();
+    if (!DstReg.isPhysical() || MI.getOperand(1).getReg() != DstReg) {
+      ++I;
+      continue;
+    }
+
+    auto Prev = getPrevNonDebugInstr(MBB, I);
+    if (Prev == MBB.end()) {
+      ++I;
+      continue;
+    }
+
+    unsigned MovOpc = Is64BitIncDec ? X86::MOV64rr : X86::MOV32rr;
+    if (Prev->getOpcode() != MovOpc || Prev->getOperand(0).getReg() != DstReg) {
+      ++I;
+      continue;
+    }
+
+    Register SrcReg = Prev->getOperand(1).getReg();
+    Register LEASrcReg = SrcReg;
+    unsigned LEAOpc = X86::LEA32r;
+    if (Is64BitIncDec) {
+      LEAOpc = X86::LEA64r;
+    } else if (ST.is64Bit()) {
+      LEAOpc = X86::LEA64_32r;
+      LEASrcReg = getX86SubSuperRegister(SrcReg, 64);
+    }
+
+    MachineInstr *NewMI =
+        BuildMI(MBB, I, MI.getDebugLoc(), TII->get(LEAOpc), DstReg)
+            .addReg(LEASrcReg)
+            .addImm(1)
+            .addReg(0)
+            .addImm(IsDec ? -1 : 1)
+            .addReg(0);
+
+    ++NumLEAs;
+    Changed = true;
+    MBB.getParent()->substituteDebugValuesForInst(MI, *NewMI, 1);
+    MBB.getParent()->substituteDebugValuesForInst(*Prev, *NewMI, 1);
+
+    auto EraseIncDec = I++;
+    MBB.erase(EraseIncDec);
+    MBB.erase(Prev);
+  }
+
+  return Changed;
+}
+
 FixupLEAsImpl::RegUsageState
 FixupLEAsImpl::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
   RegUsageState RegUsage = RU_NotUsed;
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 8a8e7a3b4df2c..abff79a5b232d 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -238,8 +238,7 @@ define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl $buf, %ecx
 ; CHECK-NEXT:    movl $32, %edx
-; CHECK-NEXT:    movl %esi, %r8d
-; CHECK-NEXT:    decl %r8d
+; CHECK-NEXT:    leal -1(%rsi), %r8d
 ; CHECK-NEXT:    jmp .LBB4_1
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB4_3: # %if.false
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index cb1c078ee5129..37cb39dac265e 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -389,8 +389,7 @@ define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    testb $1, %r14b
 ; CHECK-NEXT:    je .LBB5_7
 ; CHECK-NEXT:  # %bb.4: # in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    movl %eax, %ebp
-; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    leal 1(%rax), %ebp
 ; CHECK-NEXT:    jmp .LBB5_6
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB5_5: # %for.body2
@@ -901,8 +900,7 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    movl %esi, %ebx
-; CHECK-NEXT:    movl %edi, %ebp
-; CHECK-NEXT:    decl %ebp
+; CHECK-NEXT:    leal -1(%rdi), %ebp
 ; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    xorl %r14d, %r14d
 ; CHECK-NEXT:    xorl %r13d, %r13d
diff --git a/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll b/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll
new file mode 100644
index 0000000000000..4690fbd0cb5c3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mov-inc-dec-to-lea.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+
+define i64 @mov_dec(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_dec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %ecx
+; CHECK-NEXT:    leal -1(%rcx), %eax
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %cmp = icmp slt <32 x i8> %x, zeroinitializer
+  %mvmsk = bitcast <32 x i1> %cmp to i32
+  %dec = add i32 %mvmsk, -1
+  %ext = zext i32 %mvmsk to i64
+  %shl = shl nuw i64 %ext, 32
+  %dec.ext = zext i32 %dec to i64
+  %res = or disjoint i64 %shl, %dec.ext
+  ret i64 %res
+}
+
+define i64 @mov_inc(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %ecx
+; CHECK-NEXT:    leal 1(%rcx), %eax
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %cmp = icmp slt <32 x i8> %x, zeroinitializer
+  %mvmsk = bitcast <32 x i1> %cmp to i32
+  %inc = add i32 %mvmsk, 1
+  %ext = zext i32 %mvmsk to i64
+  %shl = shl nuw i64 %ext, 32
+  %inc.ext = zext i32 %inc to i64
+  %res = or disjoint i64 %shl, %inc.ext
+  ret i64 %res
+}
+
+define i64 @mov_inc_flags_live(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc_flags_live:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %ecx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    cmovneq %rcx, %rax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %cmp = icmp slt <32 x i8> %x, zeroinitializer
+  %mvmsk = bitcast <32 x i1> %cmp to i32
+  %inc = add i32 %mvmsk, 1
+  %iszero = icmp eq i32 %inc, 0
+  %ext = zext i32 %mvmsk to i64
+  %inc.ext = zext i32 %inc to i64
+  %sel = select i1 %iszero, i64 %inc.ext, i64 %ext
+  ret i64 %sel
+}
+
+define i64 @mov_dec_flags_live(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_dec_flags_live:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %ecx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    cmovneq %rcx, %rax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %cmp = icmp slt <32 x i8> %x, zeroinitializer
+  %mvmsk = bitcast <32 x i1> %cmp to i32
+  %dec = add i32 %mvmsk, -1
+  %iszero = icmp eq i32 %dec, 0
+  %ext = zext i32 %mvmsk to i64
+  %dec.ext = zext i32 %dec to i64
+  %sel = select i1 %iszero, i64 %dec.ext, i64 %ext
+  ret i64 %sel
+}
+
+define i64 @mov_inc_not_adjacent(<32 x i8> %x) local_unnamed_addr {
+; CHECK-LABEL: mov_inc_not_adjacent:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    xorq $5, %rax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %cmp = icmp slt <32 x i8> %x, zeroinitializer
+  %mvmsk = bitcast <32 x i1> %cmp to i32
+  %ext = zext i32 %mvmsk to i64
+  %shl = shl nuw i64 %ext, 32
+  %tmp = xor i64 %shl, 5
+  %inc = add i32 %mvmsk, 1
+  %inc.ext = zext i32 %inc to i64
+  %res = xor i64 %tmp, %inc.ext
+  ret i64 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr44412.ll b/llvm/test/CodeGen/X86/pr44412.ll
index 546dbcc156129..331d9f0e7d45e 100644
--- a/llvm/test/CodeGen/X86/pr44412.ll
+++ b/llvm/test/CodeGen/X86/pr44412.ll
@@ -8,8 +8,7 @@ define void @bar(i32 %0, i32 %1) nounwind {
 ; CHECK-NEXT:    je .LBB0_4
 ; CHECK-NEXT:  # %bb.1: # %.preheader
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    movl %edi, %ebx
-; CHECK-NEXT:    decl %ebx
+; CHECK-NEXT:    leal -1(%rdi), %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %ebx, %edi
@@ -41,8 +40,7 @@ define void @baz(i32 %0, i32 %1) nounwind {
 ; CHECK-NEXT:    je .LBB1_4
 ; CHECK-NEXT:  # %bb.1: # %.preheader
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    movl %edi, %ebx
-; CHECK-NEXT:    decl %ebx
+; CHECK-NEXT:    leal -1(%rdi), %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_2: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %ebx, %edi

``````````

</details>


https://github.com/llvm/llvm-project/pull/185194


More information about the llvm-commits mailing list