[llvm] [MCP] Optimize copies when src is used during backward propagation (PR #111130)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 4 03:29:27 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-backend-arm
Author: Vladimir Radosavljevic (vladimirradosavljevic)
Changes:
Before this patch, a redundant COPY could not be removed in the following case:
```
$R0 = OP ...
... // Read of $R0
$R1 = COPY killed $R0
```
This patch adds support for tracking the users of the source register during backward propagation, so that the redundant COPY in the case above can be removed and the code optimized to:
```
$R1 = OP ...
... // Replace all uses of $R0 with $R1
```
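To make the approach concrete, here is a minimal standalone sketch of the backward walk in C++. It is only an illustration under simplifying assumptions, not the LLVM implementation: the `Instr` model and `backwardPropagate` are hypothetical, registers are plain strings, and the real pass's safety checks (clobbers of the destination between def and copy, kill flags and liveness, tied or non-renamable operands, register-class legality) are omitted.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical, simplified instruction model (not LLVM's MachineInstr).
struct Instr {
  std::string Def;               // register written ("" if none)
  std::vector<std::string> Uses; // registers read
  bool IsCopy = false;           // if set: Def = COPY Uses[0]
};

void backwardPropagate(std::vector<Instr> &Block) {
  struct Pending {
    std::size_t CopyIdx;               // index of "Dst = COPY Src"
    std::vector<std::size_t> SrcUsers; // instructions reading Src
  };
  std::unordered_map<std::string, Pending> Tracked; // keyed by Src
  std::vector<std::size_t> DeadCopies;

  for (std::size_t I = Block.size(); I-- > 0;) {
    Instr &MI = Block[I];
    // Def of a tracked source: retarget the def and every recorded user
    // to the copy's destination; the copy is now dead.
    if (!MI.Def.empty()) {
      if (auto It = Tracked.find(MI.Def); It != Tracked.end()) {
        const std::string Dst = Block[It->second.CopyIdx].Def;
        for (std::size_t UserIdx : It->second.SrcUsers)
          for (std::string &U : Block[UserIdx].Uses)
            if (U == MI.Def)
              U = Dst;
        DeadCopies.push_back(It->second.CopyIdx);
        Tracked.erase(It);
        MI.Def = Dst;
      }
    }
    if (MI.IsCopy && !MI.Uses.empty()) {
      Tracked[MI.Uses[0]] = Pending{I, {}};
      continue;
    }
    // Read of a tracked source: record the user for the later rewrite.
    // Before this patch, such a read forced the copy to be given up.
    for (const std::string &U : MI.Uses)
      if (auto It = Tracked.find(U); It != Tracked.end())
        It->second.SrcUsers.push_back(I);
  }
  // Erase dead copies highest-index-first so lower indices stay valid.
  std::sort(DeadCopies.rbegin(), DeadCopies.rend());
  for (std::size_t Idx : DeadCopies)
    Block.erase(Block.begin() + static_cast<std::ptrdiff_t>(Idx));
}
```

On the block from the summary, the walk first tracks `$R1 = COPY killed $R0`, then records the intermediate read of `$R0` as a source user, and finally, on reaching `$R0 = OP ...`, rewrites both the def and the recorded read to `$R1` and marks the copy dead.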
---
Patch is 1.37 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111130.diff
65 Files Affected:
- (modified) llvm/lib/CodeGen/MachineCopyPropagation.cpp (+75-2)
- (modified) llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll (+2-3)
- (modified) llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll (+30-38)
- (modified) llvm/test/CodeGen/Mips/llvm-ir/srem.ll (+30-38)
- (modified) llvm/test/CodeGen/Mips/llvm-ir/udiv.ll (+30-38)
- (modified) llvm/test/CodeGen/Mips/llvm-ir/urem.ll (+30-38)
- (modified) llvm/test/CodeGen/Mips/mcount.ll (+2-3)
- (removed) llvm/test/CodeGen/Mips/micromips-gp-rc.ll (-18)
- (modified) llvm/test/CodeGen/Mips/tailcall/tailcall.ll (+17-4)
- (modified) llvm/test/CodeGen/Mips/tls.ll (+3-4)
- (modified) llvm/test/CodeGen/X86/fp128-libcalls-strict.ll (+4-6)
- (modified) llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll (+5-8)
- (modified) llvm/test/CodeGen/X86/matrix-multiply.ll (+15-20)
- (modified) llvm/test/CodeGen/X86/mul-i1024.ll (+29-40)
- (modified) llvm/test/CodeGen/X86/mul-i512.ll (+6-8)
- (modified) llvm/test/CodeGen/X86/pr46877.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/sdiv_fix.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/shift-and.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/sqrt-fastmath.ll (+3-4)
- (modified) llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+6-9)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll (+42-61)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll (+11-15)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll (+172-247)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll (+37-54)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll (+222-325)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll (+89-128)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll (+27-36)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll (+100-141)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll (+73-103)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll (+70-97)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll (+248-335)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll (+164-236)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll (+6-8)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll (+126-162)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll (+132-177)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll (+112-152)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll (+80-112)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll (+17-25)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll (+49-70)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll (+73-99)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll (+159-228)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll (+193-283)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+37-52)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+17-25)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll (+131-181)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+345-471)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+57-84)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll (+39-52)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll (+96-120)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll (+49-64)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+244-338)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll (+146-216)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll (+8-11)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll (+136-152)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll (+268-352)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+38-52)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+60-84)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+107-149)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+41-61)
- (modified) llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll (+28-37)
- (modified) llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll (+18-24)
- (modified) llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll (+16-21)
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll (+2-3)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 8bcc437cbfb865..8293aba823ed79 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -110,6 +110,7 @@ class CopyTracker {
struct CopyInfo {
MachineInstr *MI = nullptr;
MachineInstr *LastSeenUseInCopy = nullptr;
+ SmallPtrSet<MachineInstr *, 4> SrcUsers;
SmallVector<MCRegister, 4> DefRegs;
bool Avail = false;
};
@@ -224,6 +225,43 @@ class CopyTracker {
}
}
+ /// Track a copy's source users, and return false if that can't be done.
+ /// We can only track users if there is a COPY instruction whose source
+ /// is the same as \p Reg.
+ bool trackSrcUsers(MCRegister Reg, MachineInstr &MI,
+ const TargetRegisterInfo &TRI, const TargetInstrInfo &TII,
+ bool UseCopyInstr) {
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI);
+ if (!AvailCopy)
+ return false;
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*AvailCopy, TII, UseCopyInstr);
+ Register Src = CopyOperands->Source->getReg();
+
+ // Bail out if the source of the copy is not the same as Reg.
+ if (Src != Reg)
+ return false;
+
+ auto I = Copies.find(RU);
+ if (I == Copies.end())
+ return false;
+
+ I->second.SrcUsers.insert(&MI);
+ return true;
+ }
+
+ /// Return the tracked source users for a given register.
+ SmallPtrSet<MachineInstr *, 4> getSrcUsers(MCRegister Reg,
+ const TargetRegisterInfo &TRI) {
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ auto I = Copies.find(RU);
+ if (I == Copies.end())
+ return {};
+ return I->second.SrcUsers;
+ }
+
/// Add this copy's registers into the tracker's copy maps.
void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI,
const TargetInstrInfo &TII, bool UseCopyInstr) {
@@ -236,7 +274,7 @@ class CopyTracker {
// Remember Def is defined by the copy.
for (MCRegUnit Unit : TRI.regunits(Def))
- Copies[Unit] = {MI, nullptr, {}, true};
+ Copies[Unit] = {MI, nullptr, {}, {}, true};
// Remember source that's copied to Def. Once it's clobbered, then
// it's no longer available for copy propagation.
@@ -427,6 +465,7 @@ class MachineCopyPropagation : public MachineFunctionPass {
bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
bool hasOverlappingMultipleDef(const MachineInstr &MI,
const MachineOperand &MODef, Register Def);
+ bool canUpdateSrcUsers(const MachineInstr &Copy, const MachineOperand &MODef);
/// Candidates for deletion.
SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
@@ -667,6 +706,26 @@ bool MachineCopyPropagation::hasOverlappingMultipleDef(
return false;
}
+/// Return true if it is safe to update the users of the source register of the
+/// copy.
+bool MachineCopyPropagation::canUpdateSrcUsers(const MachineInstr &Copy,
+ const MachineOperand &CopySrc) {
+ for (auto *SrcUser : Tracker.getSrcUsers(CopySrc.getReg(), *TRI)) {
+ if (hasImplicitOverlap(*SrcUser, CopySrc))
+ return false;
+
+ for (MachineOperand &MO : SrcUser->uses()) {
+ if (!MO.isReg() || MO.getReg() != CopySrc.getReg())
+ continue;
+ if (MO.isTied() || !MO.isRenamable() ||
+ !isBackwardPropagatableRegClassCopy(Copy, *SrcUser,
+ MO.getOperandNo()))
+ return false;
+ }
+ }
+ return true;
+}
+
/// Look for available copies whose destination register is used by \p MI and
/// replace the use in \p MI with the copy's source register.
void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
@@ -1030,6 +1089,9 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
if (hasOverlappingMultipleDef(MI, MODef, Def))
continue;
+ if (!canUpdateSrcUsers(*Copy, *CopyOperands->Source))
+ continue;
+
LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
<< "\n with " << printReg(Def, TRI) << "\n in "
<< MI << " from " << *Copy);
@@ -1037,6 +1099,15 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
MODef.setReg(Def);
MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
+ for (auto *SrcUser : Tracker.getSrcUsers(Src, *TRI)) {
+ for (MachineOperand &MO : SrcUser->operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Src)
+ continue;
+ MO.setReg(Def);
+ MO.setIsRenamable(CopyOperands->Destination->isRenamable());
+ }
+ }
+
LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
MaybeDeadCopies.insert(Copy);
Changed = true;
@@ -1102,7 +1173,9 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
CopyDbgUsers[Copy].insert(&MI);
}
}
- } else {
+ } else if (!Tracker.trackSrcUsers(MO.getReg().asMCReg(), MI, *TRI, *TII,
+ UseCopyInstr)) {
+ // If we can't track the source users, invalidate the register.
Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
UseCopyInstr);
}
diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
index afd75940b45932..464808ec8861b3 100644
--- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
@@ -7,12 +7,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; ARMV6: @ %bb.0: @ %start
; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; ARMV6-NEXT: sub sp, sp, #28
-; ARMV6-NEXT: ldr r7, [sp, #72]
+; ARMV6-NEXT: ldr lr, [sp, #72]
; ARMV6-NEXT: mov r6, r0
; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill
; ARMV6-NEXT: ldr r4, [sp, #84]
-; ARMV6-NEXT: umull r1, r0, r2, r7
-; ARMV6-NEXT: mov lr, r7
+; ARMV6-NEXT: umull r1, r0, r2, lr
; ARMV6-NEXT: umull r5, r10, r4, r2
; ARMV6-NEXT: str r1, [r6]
; ARMV6-NEXT: ldr r6, [sp, #80]
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
index 8d548861f43936..72cead18f89fab 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -388,9 +388,8 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
; MMR3-NEXT: .cfi_def_cfa_offset 24
; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
-; MMR3-NEXT: addu $2, $2, $25
-; MMR3-NEXT: lw $25, %call16(__divdi3)($2)
-; MMR3-NEXT: move $gp, $2
+; MMR3-NEXT: addu $gp, $2, $25
+; MMR3-NEXT: lw $25, %call16(__divdi3)($gp)
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
@@ -405,9 +404,8 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
; MMR6-NEXT: .cfi_def_cfa_offset 24
; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
-; MMR6-NEXT: addu $2, $2, $25
-; MMR6-NEXT: lw $25, %call16(__divdi3)($2)
-; MMR6-NEXT: move $gp, $2
+; MMR6-NEXT: addu $gp, $2, $25
+; MMR6-NEXT: lw $25, %call16(__divdi3)($gp)
; MMR6-NEXT: jalr $25
; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 24
@@ -549,65 +547,59 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
; MMR3: # %bb.0: # %entry
; MMR3-NEXT: lui $2, %hi(_gp_disp)
; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR3-NEXT: addiusp -48
-; MMR3-NEXT: .cfi_def_cfa_offset 48
-; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR3-NEXT: swp $16, 36($sp)
+; MMR3-NEXT: addiusp -40
+; MMR3-NEXT: .cfi_def_cfa_offset 40
+; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
; MMR3-NEXT: .cfi_offset 17, -8
-; MMR3-NEXT: .cfi_offset 16, -12
-; MMR3-NEXT: addu $16, $2, $25
+; MMR3-NEXT: addu $gp, $2, $25
; MMR3-NEXT: move $1, $7
-; MMR3-NEXT: lw $7, 68($sp)
-; MMR3-NEXT: lw $17, 72($sp)
-; MMR3-NEXT: lw $3, 76($sp)
+; MMR3-NEXT: lw $7, 60($sp)
+; MMR3-NEXT: lw $17, 64($sp)
+; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: move $2, $sp
; MMR3-NEXT: sw16 $3, 28($2)
; MMR3-NEXT: sw16 $17, 24($2)
; MMR3-NEXT: sw16 $7, 20($2)
-; MMR3-NEXT: lw $3, 64($sp)
+; MMR3-NEXT: lw $3, 56($sp)
; MMR3-NEXT: sw16 $3, 16($2)
-; MMR3-NEXT: lw $25, %call16(__divti3)($16)
+; MMR3-NEXT: lw $25, %call16(__divti3)($gp)
; MMR3-NEXT: move $7, $1
-; MMR3-NEXT: move $gp, $16
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
-; MMR3-NEXT: lwp $16, 36($sp)
-; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR3-NEXT: addiusp 48
+; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
;
; MMR6-LABEL: sdiv_i128:
; MMR6: # %bb.0: # %entry
; MMR6-NEXT: lui $2, %hi(_gp_disp)
; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR6-NEXT: addiu $sp, $sp, -48
-; MMR6-NEXT: .cfi_def_cfa_offset 48
-; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: addiu $sp, $sp, -40
+; MMR6-NEXT: .cfi_def_cfa_offset 40
+; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
; MMR6-NEXT: .cfi_offset 17, -8
-; MMR6-NEXT: .cfi_offset 16, -12
-; MMR6-NEXT: addu $16, $2, $25
+; MMR6-NEXT: addu $gp, $2, $25
; MMR6-NEXT: move $1, $7
-; MMR6-NEXT: lw $7, 68($sp)
-; MMR6-NEXT: lw $17, 72($sp)
-; MMR6-NEXT: lw $3, 76($sp)
+; MMR6-NEXT: lw $7, 60($sp)
+; MMR6-NEXT: lw $17, 64($sp)
+; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: move $2, $sp
; MMR6-NEXT: sw16 $3, 28($2)
; MMR6-NEXT: sw16 $17, 24($2)
; MMR6-NEXT: sw16 $7, 20($2)
-; MMR6-NEXT: lw $3, 64($sp)
+; MMR6-NEXT: lw $3, 56($sp)
; MMR6-NEXT: sw16 $3, 16($2)
-; MMR6-NEXT: lw $25, %call16(__divti3)($16)
+; MMR6-NEXT: lw $25, %call16(__divti3)($gp)
; MMR6-NEXT: move $7, $1
-; MMR6-NEXT: move $gp, $16
; MMR6-NEXT: jalr $25
-; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR6-NEXT: addiu $sp, $sp, 48
+; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
entry:
%r = sdiv i128 %a, %b
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
index 29cb34b8d970f1..72496fcc53a5ac 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -336,9 +336,8 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
; MMR3-NEXT: .cfi_def_cfa_offset 24
; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
-; MMR3-NEXT: addu $2, $2, $25
-; MMR3-NEXT: lw $25, %call16(__moddi3)($2)
-; MMR3-NEXT: move $gp, $2
+; MMR3-NEXT: addu $gp, $2, $25
+; MMR3-NEXT: lw $25, %call16(__moddi3)($gp)
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
@@ -353,9 +352,8 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
; MMR6-NEXT: .cfi_def_cfa_offset 24
; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
-; MMR6-NEXT: addu $2, $2, $25
-; MMR6-NEXT: lw $25, %call16(__moddi3)($2)
-; MMR6-NEXT: move $gp, $2
+; MMR6-NEXT: addu $gp, $2, $25
+; MMR6-NEXT: lw $25, %call16(__moddi3)($gp)
; MMR6-NEXT: jalr $25
; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 24
@@ -497,65 +495,59 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) {
; MMR3: # %bb.0: # %entry
; MMR3-NEXT: lui $2, %hi(_gp_disp)
; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR3-NEXT: addiusp -48
-; MMR3-NEXT: .cfi_def_cfa_offset 48
-; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR3-NEXT: swp $16, 36($sp)
+; MMR3-NEXT: addiusp -40
+; MMR3-NEXT: .cfi_def_cfa_offset 40
+; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
; MMR3-NEXT: .cfi_offset 17, -8
-; MMR3-NEXT: .cfi_offset 16, -12
-; MMR3-NEXT: addu $16, $2, $25
+; MMR3-NEXT: addu $gp, $2, $25
; MMR3-NEXT: move $1, $7
-; MMR3-NEXT: lw $7, 68($sp)
-; MMR3-NEXT: lw $17, 72($sp)
-; MMR3-NEXT: lw $3, 76($sp)
+; MMR3-NEXT: lw $7, 60($sp)
+; MMR3-NEXT: lw $17, 64($sp)
+; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: move $2, $sp
; MMR3-NEXT: sw16 $3, 28($2)
; MMR3-NEXT: sw16 $17, 24($2)
; MMR3-NEXT: sw16 $7, 20($2)
-; MMR3-NEXT: lw $3, 64($sp)
+; MMR3-NEXT: lw $3, 56($sp)
; MMR3-NEXT: sw16 $3, 16($2)
-; MMR3-NEXT: lw $25, %call16(__modti3)($16)
+; MMR3-NEXT: lw $25, %call16(__modti3)($gp)
; MMR3-NEXT: move $7, $1
-; MMR3-NEXT: move $gp, $16
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
-; MMR3-NEXT: lwp $16, 36($sp)
-; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR3-NEXT: addiusp 48
+; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
;
; MMR6-LABEL: srem_i128:
; MMR6: # %bb.0: # %entry
; MMR6-NEXT: lui $2, %hi(_gp_disp)
; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR6-NEXT: addiu $sp, $sp, -48
-; MMR6-NEXT: .cfi_def_cfa_offset 48
-; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: addiu $sp, $sp, -40
+; MMR6-NEXT: .cfi_def_cfa_offset 40
+; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
; MMR6-NEXT: .cfi_offset 17, -8
-; MMR6-NEXT: .cfi_offset 16, -12
-; MMR6-NEXT: addu $16, $2, $25
+; MMR6-NEXT: addu $gp, $2, $25
; MMR6-NEXT: move $1, $7
-; MMR6-NEXT: lw $7, 68($sp)
-; MMR6-NEXT: lw $17, 72($sp)
-; MMR6-NEXT: lw $3, 76($sp)
+; MMR6-NEXT: lw $7, 60($sp)
+; MMR6-NEXT: lw $17, 64($sp)
+; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: move $2, $sp
; MMR6-NEXT: sw16 $3, 28($2)
; MMR6-NEXT: sw16 $17, 24($2)
; MMR6-NEXT: sw16 $7, 20($2)
-; MMR6-NEXT: lw $3, 64($sp)
+; MMR6-NEXT: lw $3, 56($sp)
; MMR6-NEXT: sw16 $3, 16($2)
-; MMR6-NEXT: lw $25, %call16(__modti3)($16)
+; MMR6-NEXT: lw $25, %call16(__modti3)($gp)
; MMR6-NEXT: move $7, $1
-; MMR6-NEXT: move $gp, $16
; MMR6-NEXT: jalr $25
-; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR6-NEXT: addiu $sp, $sp, 48
+; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
entry:
%r = srem i128 %a, %b
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
index cc2c6614e69c8f..9451f1e9be0967 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -336,9 +336,8 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
; MMR3-NEXT: .cfi_def_cfa_offset 24
; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
-; MMR3-NEXT: addu $2, $2, $25
-; MMR3-NEXT: lw $25, %call16(__udivdi3)($2)
-; MMR3-NEXT: move $gp, $2
+; MMR3-NEXT: addu $gp, $2, $25
+; MMR3-NEXT: lw $25, %call16(__udivdi3)($gp)
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
@@ -353,9 +352,8 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
; MMR6-NEXT: .cfi_def_cfa_offset 24
; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
-; MMR6-NEXT: addu $2, $2, $25
-; MMR6-NEXT: lw $25, %call16(__udivdi3)($2)
-; MMR6-NEXT: move $gp, $2
+; MMR6-NEXT: addu $gp, $2, $25
+; MMR6-NEXT: lw $25, %call16(__udivdi3)($gp)
; MMR6-NEXT: jalr $25
; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 24
@@ -497,65 +495,59 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) {
; MMR3: # %bb.0: # %entry
; MMR3-NEXT: lui $2, %hi(_gp_disp)
; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR3-NEXT: addiusp -48
-; MMR3-NEXT: .cfi_def_cfa_offset 48
-; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR3-NEXT: swp $16, 36($sp)
+; MMR3-NEXT: addiusp -40
+; MMR3-NEXT: .cfi_def_cfa_offset 40
+; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR3-NEXT: .cfi_offset 31, -4
; MMR3-NEXT: .cfi_offset 17, -8
-; MMR3-NEXT: .cfi_offset 16, -12
-; MMR3-NEXT: addu $16, $2, $25
+; MMR3-NEXT: addu $gp, $2, $25
; MMR3-NEXT: move $1, $7
-; MMR3-NEXT: lw $7, 68($sp)
-; MMR3-NEXT: lw $17, 72($sp)
-; MMR3-NEXT: lw $3, 76($sp)
+; MMR3-NEXT: lw $7, 60($sp)
+; MMR3-NEXT: lw $17, 64($sp)
+; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: move $2, $sp
; MMR3-NEXT: sw16 $3, 28($2)
; MMR3-NEXT: sw16 $17, 24($2)
; MMR3-NEXT: sw16 $7, 20($2)
-; MMR3-NEXT: lw $3, 64($sp)
+; MMR3-NEXT: lw $3, 56($sp)
; MMR3-NEXT: sw16 $3, 16($2)
-; MMR3-NEXT: lw $25, %call16(__udivti3)($16)
+; MMR3-NEXT: lw $25, %call16(__udivti3)($gp)
; MMR3-NEXT: move $7, $1
-; MMR3-NEXT: move $gp, $16
; MMR3-NEXT: jalr $25
; MMR3-NEXT: nop
-; MMR3-NEXT: lwp $16, 36($sp)
-; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR3-NEXT: addiusp 48
+; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
;
; MMR6-LABEL: udiv_i128:
; MMR6: # %bb.0: # %entry
; MMR6-NEXT: lui $2, %hi(_gp_disp)
; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MMR6-NEXT: addiu $sp, $sp, -48
-; MMR6-NEXT: .cfi_def_cfa_offset 48
-; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill
-; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: addiu $sp, $sp, -40
+; MMR6-NEXT: .cfi_def_cfa_offset 40
+; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 31, -4
; MMR6-NEXT: .cfi_offset 17, -8
-; MMR6-NEXT: .cfi_offset 16, -12
-; MMR6-NEXT: addu $16, $2, $25
+; MMR6-NEXT: addu $gp, $2, $25
; MMR6-NEXT: move $1, $7
-; MMR6-NEXT: lw $7, 68($sp)
-; MMR6-NEXT: lw $17, 72($sp)
-; MMR6-NEXT: lw $3, 76($sp)
+; MMR6-NEXT: lw $7, 60($sp)
+; MMR6-NEXT: lw $17, 64($sp)
+; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: move $2, $sp
; MMR6-NEXT: sw16 $3, 28($2)
; MMR6-NEXT: sw16 $17, 24($2)
; MMR6-NEXT: sw16 $7, 20($2)
-; MMR6-NEXT: lw $3, 64($sp)
+; MMR6-NEXT: lw $3, 56($sp)
; MMR6-NEXT: sw16 $3, 16($2)
-; MMR6-NEXT: lw $25, %call16(__udivti3)($16)
+; MMR6-NEXT: lw $25, %call16(__udivti3)($gp)
; MMR6-NEXT: move $7, $1
-; MMR6-NEXT: move $gp, $16
; MMR6-NEXT: jalr $25
-; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload
-; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MMR6-NEXT: addiu $sp, $sp, 48
+; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload
+; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Relo...
[truncated]
``````````
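As a note on the new `canUpdateSrcUsers` check in the hunk above, a minimal sketch of the per-operand condition it enforces might look as follows. The `Operand` type and field names here are hypothetical; the real check additionally rejects implicit-operand overlap and consults `isBackwardPropagatableRegClassCopy` for register-class legality.

```cpp
// Hypothetical operand model; only the flags relevant to the check.
struct Operand {
  bool IsUse = false;       // reads the copy's source register
  bool IsRenamable = false; // not fixed by ABI or encoding constraints
  bool IsTied = false;      // tied to a def of the same instruction
};

// A tracked source user blocks the optimization unless every read of the
// copy's source register can legally be retargeted to the destination.
bool canRewriteUser(const Operand &O) {
  if (!O.IsUse)
    return true;                     // defs are not rewritten here
  return O.IsRenamable && !O.IsTied; // mirrors canUpdateSrcUsers above
}
```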
https://github.com/llvm/llvm-project/pull/111130