[llvm] [X86] Stop emitting CFI instructions on i386-windows (PR #135648)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 14 10:16:17 PDT 2025


llvmbot wrote:


@llvm/pr-subscribers-backend-x86

Author: Sergei Barannikov (s-barannikov)

Changes

`X86FrameLowering` normally emits CFI instructions when `needsDwarfCFI` returns true. Before this patch, that method was assumed to return true only on non-Windows targets, but it also returns true on Windows i386, resulting in erroneous generation of CFI instructions on that platform.
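For readers skimming the summary, here is a minimal, self-contained model of the predicate change (not the LLVM sources; the real hunk in `X86FrameLowering.cpp` appears in the diff below):

```cpp
// Toy model of X86FrameLowering::needsDwarfCFI before and after this patch.
// i386-windows uses the WinEH exception model but is not a Win64 prologue,
// so the old predicate still returned true there.
#include <cstdio>

enum class ExceptionHandling { None, DwarfCFI, SjLj, ARM, WinEH };

struct FuncInfo {
  bool IsWin64Prologue;   // stands in for isWin64Prologue(MF)
  ExceptionHandling EH;   // stands in for MCAsmInfo::getExceptionHandlingType()
  bool NeedsFrameMoves;   // stands in for MF.needsFrameMoves()
};

bool needsDwarfCFI_old(const FuncInfo &F) {
  return !F.IsWin64Prologue && F.NeedsFrameMoves;
}

bool needsDwarfCFI_new(const FuncInfo &F) {
  return F.EH != ExceptionHandling::WinEH && F.NeedsFrameMoves;
}

int main() {
  FuncInfo I386Windows{/*IsWin64Prologue=*/false, ExceptionHandling::WinEH,
                       /*NeedsFrameMoves=*/true};
  // Prints "old: 1, new: 0": the patch stops CFI emission on i386-windows.
  std::printf("old: %d, new: %d\n", (int)needsDwarfCFI_old(I386Windows),
              (int)needsDwarfCFI_new(I386Windows));
}
```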

This behavior cannot be observed in the generated assembly because AsmPrinter suppresses printing of these instructions for the WinEH exception model. I'm going to change this: the idea is that if a target has created a CFI instruction, it should be printed; if it should not be printed, it should not have been created in the first place.

There were a couple of places where `needsDwarfCFI` wasn't used, also resulting in erroneous generation of CFI instructions. Those are now fixed as well.
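The fix at those call sites follows the same guard pattern as the `X86CallFrameOptimization.cpp` hunk below; a simplified paraphrase (illustrative names, not the LLVM API):

```cpp
// Sketch of the guarded-emission pattern: only build a CFI adjustment when
// DWARF CFI is actually wanted for this function.
#include <string>
#include <vector>

struct Frame {
  bool NeedsDwarfCFI;  // stands in for TFL->needsDwarfCFI(MF)
  bool HasFP;          // stands in for TFL->hasFP(MF)
};

void adjustCallSequence(const Frame &TFL, std::vector<std::string> &MBB) {
  // Before: only "!HasFP" was checked, so i386-windows still got CFI notes.
  // After: the needsDwarfCFI guard is added in front.
  if (TFL.NeedsDwarfCFI && !TFL.HasFP)
    MBB.push_back("CFI_INSTRUCTION adjust_cfa_offset");
}
```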

The changes in tests appear to be caused by `SlotIndexes` assigning different numbers to instructions, which affects live range lengths and, consequently, the register allocator's heuristics.

---

Patch is 42.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135648.diff


11 Files Affected:

- (modified) llvm/lib/Target/X86/X86CallFrameOptimization.cpp (+1-1) 
- (modified) llvm/lib/Target/X86/X86FrameLowering.cpp (+5-4) 
- (modified) llvm/lib/Target/X86/X86FrameLowering.h (+2-2) 
- (modified) llvm/test/CodeGen/MIR/X86/diexpr-win32.mir (-2) 
- (modified) llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll (+7-7) 
- (modified) llvm/test/CodeGen/X86/andnot-patterns.ll (+10-10) 
- (modified) llvm/test/CodeGen/X86/fp128-cast.ll (+5-5) 
- (modified) llvm/test/CodeGen/X86/fp128-libcalls-strict.ll (+12-12) 
- (modified) llvm/test/CodeGen/X86/optimize-max-0.ll (+158-165) 
- (modified) llvm/test/CodeGen/X86/sbb-false-dep.ll (+11-11) 
- (modified) llvm/test/CodeGen/X86/sdiv_fix.ll (+36-38) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 0e4add27cce02..c1441e48cf29d 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -570,7 +570,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
     // For debugging, when using SP-based CFA, we need to adjust the CFA
     // offset after each push.
     // TODO: This is needed only if we require precise CFA.
-    if (!TFL->hasFP(MF))
+    if (TFL->needsDwarfCFI(MF) && !TFL->hasFP(MF))
       TFL->BuildCFI(
           MBB, std::next(Push), DL,
           MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index b7374558604ec..4846b3c9735f6 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1468,7 +1468,9 @@ bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {
 }
 
 bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
-  return !isWin64Prologue(MF) && MF.needsFrameMoves();
+  return MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() !=
+             ExceptionHandling::WinEH &&
+         MF.needsFrameMoves();
 }
 
 /// Return true if an opcode is part of the REP group of instructions
@@ -3808,8 +3810,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
     Amount = alignTo(Amount, getStackAlign());
 
     const Function &F = MF.getFunction();
-    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
-    bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();
+    bool DwarfCFI = needsDwarfCFI(MF);
 
     // If we have any exception handlers in this function, and we adjust
     // the SP before calls, we may need to indicate this to the unwinder
@@ -3818,7 +3819,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
     // GNU_ARGS_SIZE.
     // TODO: We don't need to reset this between subsequent functions,
     // if it didn't change.
-    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
+    bool HasDwarfEHHandlers = DwarfCFI && !MF.getLandingPads().empty();
 
     if (HasDwarfEHHandlers && !isDestroy &&
         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index f1e3796f5fddd..6c6adc6cc035d 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -238,14 +238,14 @@ class X86FrameLowering : public TargetFrameLowering {
   /// frame of the top of stack function) as part of it's ABI.
   bool has128ByteRedZone(const MachineFunction& MF) const;
 
+  bool needsDwarfCFI(const MachineFunction &MF) const;
+
 protected:
   bool hasFPImpl(const MachineFunction &MF) const override;
 
 private:
   bool isWin64Prologue(const MachineFunction &MF) const;
 
-  bool needsDwarfCFI(const MachineFunction &MF) const;
-
   uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
 
   /// Emit target stack probe as a call to a helper function
diff --git a/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir b/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
index d8d76758a08a0..54112dc9b12fc 100644
--- a/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
+++ b/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
@@ -190,8 +190,6 @@ body:             |
     liveins: $esi
 
     frame-setup PUSH32r killed $esi, implicit-def $esp, implicit $esp
-    CFI_INSTRUCTION def_cfa_offset 8
-    CFI_INSTRUCTION offset $esi, -8
     $esi = MOV32rm $esp, 1, _, 8, _ :: (load (s32) from %fixed-stack.2)
     DBG_VALUE $esp, 0, !26, !10, debug-location !25
     DBG_VALUE $esp, 0, !23, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref), debug-location !25
diff --git a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index b32afdc2214e0..a3a88bd07e65c 100644
--- a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -28,20 +28,20 @@ define i16 @SQLDriversW(ptr %henv, i16 zeroext  %fDir, ptr %szDrvDesc, i16 signe
 ; CHECK-NEXT:  ## %bb.4: ## %bb37
 ; CHECK-NEXT:    movw $0, 40(%edi)
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    leal (,%ecx,4), %ecx
-; CHECK-NEXT:    leal (,%ebx,4), %edx
+; CHECK-NEXT:    leal (,%ecx,4), %eax
+; CHECK-NEXT:    leal (,%ebx,4), %ecx
 ; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movzwl %bp, %eax
+; CHECK-NEXT:    movzwl %bp, %edx
+; CHECK-NEXT:    cwtl
 ; CHECK-NEXT:    movswl %cx, %ecx
-; CHECK-NEXT:    movswl %dx, %edx
 ; CHECK-NEXT:    pushl $87
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $0
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    calll _SQLDrivers_Internal
 ; CHECK-NEXT:    addl $48, %esp
diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
index fc573fbd4fc99..370f86dad0427 100644
--- a/llvm/test/CodeGen/X86/andnot-patterns.ll
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -198,28 +198,28 @@ define i64 @andnot_rotl_i64_multiuse_rot(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    notl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    jmp .LBB4_3
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB4_3:
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    andl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    andl %ebx, %edi
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll use_i64 at PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    movl %esi, %eax
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 1de2484d47ba1..42e9b396fef5b 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -1139,19 +1139,19 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    calll __multf3
 ; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index a85b53ea62ac7..6d95ecc4880e5 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -3418,28 +3418,28 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    sete %bl
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
 ; X86-NEXT:    addl $32, %esp
@@ -3501,28 +3501,28 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    sete %bl
+; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -3640,28 +3640,28 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %bl
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
 ; X86-NEXT:    addl $32, %esp
@@ -3723,28 +3723,28 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    setne %bl
+; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index 283c00e17f21a..7a8d2e97bbcbd 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    imull %ebp, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    imull %esi, %eax
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    shrl $30, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    sarl $2, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    sarl $31, %ebp
+; CHECK-NEXT:    shrl $30, %ebp
+; CHECK-NEXT:    addl %eax, %ebp
+; CHECK-NEXT:    sarl $2, %ebp
+; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
-; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    testl %esi, %esi
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_4: ## %bb6
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movzbl (%eax,%esi,2), %ebx
-; CHECK-NEXT:    movb %bl, (%edx,%esi)
-; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %ebp, %esi
+; CHECK-NEXT:    movzbl (%eax,%edi,2), %ebx
+; CHECK-NEXT:    movb %bl, (%edx,%edi)
+; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    cmpl %esi, %edi
 ; CHECK-NEXT:    jl LBB0_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl %ebp, %edx
-; CHECK-NEXT:    cmpl %edi, %ecx
+; CHECK-NEXT:    addl %esi, %edx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    je LBB0_12
 ; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    jmp LBB0_4
 ; CHECK-NEXT:  LBB0_12: ## %bb18.loopexit
+; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %ebp, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    jle LBB0_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
-; CHECK-NEXT:    cmpl $2, %ebp
+; CHECK-NEXT:    cmpl $2, %esi
 ; CHECK-NEXT:    jl LBB0_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl %ebp, %edx
-; CHECK-NEXT:    shrl $31, %edx
-; CHECK-NEXT:    addl %ebp, %edx
-; CHECK-NEXT:    sarl %edx
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    shrl $31, %ebp
+; CHECK-NEXT:    addl %esi, %ebp
+; CHECK-NEXT:    sarl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $31, %ecx
@@ -84,12 +84,12 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    addl $2, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl %esi, %ecx
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl $2, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_9: ## %bb13
@@ -97,89 +97,90 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    ## Child Loop BB0_10 Depth 2
 ; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addl %edx, %edi
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_10: ## %bb14
 ; CHECK-NEXT:    ## Parent Loop BB0_9 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movzbl -2(%edi,%esi,4), %ebx
-; CHECK-NEXT:    movb %bl, (%ecx,%esi)
-; CHECK-NEXT:    movzbl (%edi,%esi,4), %ebx
-; CHECK-NEXT:    movb %bl, (%eax,%esi)
-; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %edx, %esi
+; CHECK-NEXT:    movzbl -2(%edi,%ebx,4), %edx
+; CHECK-NEXT:    movb %dl, (%ecx,%ebx)
+; CHECK-NEXT:    movzbl (%edi,%ebx,4), %edx
+; CHECK-NEXT:    movb %dl, (%eax,%ebx)
+; CHECK-NEXT:    incl %ebx
+; CHECK-NEXT:    cmpl %ebp, %ebx
 ; CHECK-NEXT:    jl LBB0_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB0_9 Depth=1
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl $2, %esi
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %ebp, %eax
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl $2, %edx
+; CHECK-NEXT:    addl %ebp, %ecx
 ; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; CHECK-NEXT:    jl LBB0_9
 ; CHECK-NEXT:  LBB0_13: ## %bb20
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    cmpl $1, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $1, %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %eax
+; CHECK-NEXT:    cmpl $3, %ecx
 ; CHECK-NEXT:    jne LBB0_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; CHECK-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folde...
[truncated]

``````````


https://github.com/llvm/llvm-project/pull/135648


More information about the llvm-commits mailing list