[llvm] [X86] Eliminate redundant zero-extension instructions (PR #161401)
Francisco Geiman Thiesen via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 09:54:29 PDT 2025
https://github.com/FranciscoThiesen created https://github.com/llvm/llvm-project/pull/161401
This pass eliminates redundant MOVZX32rr8 instructions when the source register is a sub-register of the destination and the destination's upper bits are already known to be zero.
For example, in loops processing byte values:
```
movzbl (%rdi), %ecx ; ECX upper 24 bits are zero
...
movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0
```
The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Analyzes backward through basic blocks to verify upper bits are zero
- Handles cross-block analysis by checking predecessor definitions
- Only eliminates when provably safe (not heuristic)
This commonly occurs in loops that process byte values, saving one instruction per loop iteration and reducing code size by 3 bytes.
This is to solve the problem outlined in #160710
>From 622f767197a08e7a2fd307b7a4c9652540d25432 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Tue, 30 Sep 2025 09:52:34 -0700
Subject: [PATCH] [X86] Eliminate redundant zero-extension instructions This
pass eliminates redundant MOVZX32rr8 instructions when the source register is
a sub-register of the destination and the destination's upper bits are
already known to be zero.
For example, in loops processing byte values:
```
movzbl (%rdi), %ecx ; ECX upper 24 bits are zero
...
movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0
```
The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Analyzes backward through basic blocks to verify upper bits are zero
- Handles cross-block analysis by checking predecessor definitions
- Only eliminates when provably safe (not heuristic)
This commonly occurs in loops that process byte values, saving one
instruction per loop iteration and reducing code size by 3 bytes.
---
llvm/lib/Target/X86/CMakeLists.txt | 1 +
llvm/lib/Target/X86/X86.h | 4 +
.../X86/X86EliminateRedundantZeroExtend.cpp | 292 ++++++++++++++++++
llvm/lib/Target/X86/X86TargetMachine.cpp | 1 +
.../CodeGen/X86/eliminate-redundant-zext.ll | 63 ++++
5 files changed, 361 insertions(+)
create mode 100644 llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
create mode 100644 llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233cf8ecf..351ba623e2b6d 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -47,6 +47,7 @@ set(sources
X86FixupVectorConstants.cpp
X86AvoidStoreForwardingBlocks.cpp
X86DynAllocaExpander.cpp
+ X86EliminateRedundantZeroExtend.cpp
X86FixupSetCC.cpp
X86FlagsCopyLowering.cpp
X86FloatingPoint.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 6261fadf10a7a..cd59eb5c80149 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass();
/// the upper portions of registers, and to save code size.
FunctionPass *createX86FixupBWInsts();
+/// Return a Machine IR pass that eliminates redundant zero-extension
+/// instructions where the upper bits are already known to be zero.
+FunctionPass *createX86EliminateRedundantZeroExtend();
+
/// Return a Machine IR pass that reassigns instruction chains from one domain
/// to another, when profitable.
FunctionPass *createX86DomainReassignmentPass();
diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
new file mode 100644
index 0000000000000..72717b1c64794
--- /dev/null
+++ b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
@@ -0,0 +1,292 @@
+//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass eliminates redundant zero-extension instructions where the source
+/// register is a sub-register of the destination and the destination's upper
+/// bits are known to be zero.
+///
+/// For example:
+/// movzbl (%rdi), %ecx ; ECX = zero-extend byte, upper 24 bits are zero
+/// ...
+/// movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0
+///
+/// This pattern commonly occurs in loops processing byte values.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-eliminate-zext"
+#define PASS_NAME "X86 Eliminate Redundant Zero Extension"
+
+namespace {
+/// Post-RA machine pass that deletes MOVZX32rr8 instructions whose effect is
+/// already guaranteed by an earlier definition that zeroed the destination's
+/// upper bits (see the file header for the motivating example).
+class EliminateRedundantZeroExtend : public MachineFunctionPass {
+public:
+  static char ID;
+  EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  // Runs after register allocation, so the pass only ever sees physical
+  // registers.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setNoVRegs();
+  }
+
+private:
+  // Cached per-function target info, set in runOnMachineFunction.
+  // NOTE(review): TII is assigned but never used by the pass body — confirm
+  // it is needed or drop it.
+  const X86InstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+  /// Check if the register's upper bits are known to be zero at this point.
+  /// This checks backward from MI to find the most recent definition of Reg.
+  bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+                        const MachineBasicBlock &MBB) const;
+
+  /// Try to eliminate a redundant MOVZX instruction.
+  bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+                                       MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+/// Pass factory, called from the X86 target's pass pipeline setup
+/// (declared in X86.h).
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+  return new EliminateRedundantZeroExtend();
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+    Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+  // \p Reg is the 32-bit destination of a MOVZX32rr8, so "upper bits" here
+  // means bits 8-31: all of them must be provably zero for the MOVZX to be
+  // redundant.
+
+  // Return true if \p Inst, as a definition of Reg, leaves bits 8-31 zero.
+  //
+  // Only instructions that zero the *whole* register above bit 7 may appear
+  // here. In particular:
+  //  - 32-bit ALU ops (ADD32rr, SUB32ri, LEA32r, ...) only zero bits 32-63
+  //    of the containing 64-bit register; bits 8-31 hold arbitrary result
+  //    bits, so they must NOT be treated as zeroing.
+  //  - MOVZX32rr16/MOVZX32rm16 leaves bits 8-15 unconstrained, so it cannot
+  //    justify removing an 8-bit zero-extend either.
+  auto DefZeroesUpper24 = [](const MachineInstr &Inst) {
+    switch (Inst.getOpcode()) {
+    case X86::MOVZX32rr8:
+    case X86::MOVZX32rm8:
+    case X86::MOV32r0:
+      return true;
+    case X86::XOR32rr:
+      // XOR of a register with itself zeroes the register.
+      return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg();
+    default:
+      return false;
+    }
+  };
+
+  // The low byte of Reg (e.g. CL for ECX). Writes to it cannot change bits
+  // 8-31, so they are transparent to this analysis: if ECX == zext(CL) held
+  // before a write to CL, it still holds afterwards.
+  Register Low8 = TRI->getSubReg(Reg, X86::sub_8bit);
+
+  // Scan backward over [I, E) looking for the reaching definition of Reg.
+  // Returns:
+  //   1 -> found a definition proving bits 8-31 are zero
+  //   0 -> found a definition (or overlapping clobber) that may set bits 8-31
+  //  -1 -> no definition of Reg in the scanned range
+  auto ScanBackward = [&](MachineBasicBlock::const_reverse_iterator I,
+                          MachineBasicBlock::const_reverse_iterator E) -> int {
+    for (; I != E; ++I) {
+      const MachineInstr &Inst = *I;
+      // operands() includes implicit operands, so calls and other
+      // instructions that implicitly clobber Reg are handled here too.
+      for (const MachineOperand &MO : Inst.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        Register DefReg = MO.getReg();
+        if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg))
+          return DefZeroesUpper24(Inst) ? 1 : 0;
+        if (TRI->regsOverlap(DefReg, Reg)) {
+          // Writes to the low byte are transparent (see above); any other
+          // overlapping def (CH, CX, ...) may set bits 8-31.
+          if (Low8 && DefReg == Low8)
+            continue;
+          return 0;
+        }
+      }
+    }
+    return -1;
+  };
+
+  // Look for the reaching definition within this block first.
+  int Res = ScanBackward(++MI.getReverseIterator(), MBB.rend());
+  if (Res >= 0)
+    return Res == 1;
+
+  // No definition in this block: every predecessor must end with a
+  // definition that zeroes bits 8-31. If MBB is its own predecessor (a
+  // single-block loop), scanning the whole block is still correct: those are
+  // exactly the instructions executed before MI along the back edge, and
+  // finding MI itself there is sound by induction once every other
+  // predecessor establishes the invariant.
+  if (MBB.pred_empty())
+    return false;
+
+  for (const MachineBasicBlock *Pred : MBB.predecessors())
+    if (ScanBackward(Pred->rbegin(), Pred->rend()) != 1)
+      return false;
+
+  // All predecessors provide a zeroing definition.
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+    MachineInstr &MI, MachineBasicBlock &MBB) const {
+  // Only the 8->32 bit register form is handled for now; MOVZX32rr16 could
+  // be added later.
+  if (MI.getOpcode() != X86::MOVZX32rr8)
+    return false;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+
+  // The instruction can only be a no-op when the source is a sub-register of
+  // the destination (e.g. movzbl %cl, %ecx) and the destination's upper bits
+  // are already known to be zero at this program point.
+  if (!TRI->isSubRegister(Dst, Src))
+    return false;
+  if (!hasZeroUpperBits(Dst, MI, MBB))
+    return false;
+
+  // Redundant: Dst already equals the zero-extension of Src.
+  LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+  // Honor -O0 / optnone: this is a pure optimization pass.
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    // make_early_inc_range advances past MI before it is visited, so erasing
+    // MI inside tryEliminateRedundantZeroExtend is safe and every candidate
+    // in the block is examined. (The previous loop broke out of the block
+    // after the first elimination, silently skipping later candidates.)
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
+      if (tryEliminateRedundantZeroExtend(MI, MBB))
+        Changed = true;
+  }
+
+  return Changed;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..72835150e8277 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOptLevel::None) {
addPass(createX86FixupBWInsts());
+ addPass(createX86EliminateRedundantZeroExtend());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
new file mode 100644
index 0000000000000..2c9e46e043187
--- /dev/null
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is a sub-register of the destination and the destination's upper
+; bits are already known to be zero.
+
+; This is the original countholes test case from GitHub issue that demonstrates
+; the redundant movzbl %cl, %ecx in the loop
+define i32 @countholes(ptr %s) {
+; CHECK-LABEL: countholes:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl (%rdi), %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $48, %cl
+; CHECK-NEXT: jb .LBB0_3
+; CHECK-NEXT: # %bb.1: # %while.body.preheader
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: leaq pre_table(%rip), %rdx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %while.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addl $-48, %ecx
+; CHECK-NEXT: addl (%rdx,%rcx,4), %eax
+; CHECK-NEXT: movzbl (%rdi), %ecx
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: cmpb $47, %cl
+; CHECK-NEXT: ja .LBB0_2
+; CHECK-NEXT: .LBB0_3: # %cleanup
+; CHECK-NEXT: retq
+entry:
+ %c.0 = load i8, ptr %s, align 1
+ %conv = zext i8 %c.0 to i32
+ %cmp = icmp ult i8 %c.0, 48
+ br i1 %cmp, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ]
+ %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ]
+ %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %conv3 = zext i8 %c.010 to i64
+ %sub = add nsw i64 %conv3, -48
+ %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add i32 %0, %tot.09
+ %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1
+ %c.1 = load i8, ptr %incdec.ptr, align 1
+ %cmp1 = icmp ult i8 %c.1, 48
+ br i1 %cmp1, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ]
+ ret i32 %retval.0
+}
+
+ at pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4
More information about the llvm-commits
mailing list