[llvm] [MachineCSE] Trivially coalesce subreg copies to expose CSE (PR #153120)

Mon Aug 11 18:52:38 PDT 2025

https://github.com/AZero13 created https://github.com/llvm/llvm-project/pull/153120

None

>From 2af22bd273cf334d0310efb6b9e1bf5d6d1265bd Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Mon, 11 Aug 2025 21:52:17 -0400
Subject: [PATCH] [MachineCSE] Trivially coalesce subreg copies to expose CSE

---
 llvm/lib/CodeGen/MachineCSE.cpp               | 54 ++++++++++++++-----
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      | 16 +++---
 .../test/CodeGen/X86/cse-add-with-overflow.ll |  3 +-
 3 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 780ed92bc8581..a1d236c7955ec 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -186,20 +186,48 @@ bool MachineCSEImpl::PerformTrivialCopyPropagation(MachineInstr *MI,
     Register SrcReg = DefMI->getOperand(1).getReg();
     if (!SrcReg.isVirtual())
       continue;
-    // FIXME: We should trivially coalesce subregister copies to expose CSE
-    // opportunities on instructions with truncated operands (see
-    // cse-add-with-overflow.ll). This can be done here as follows:
-    // if (SrcSubReg)
-    //  RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
-    //                                     SrcSubReg);
-    // MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
-    //
-    // The 2-addr pass has been updated to handle coalesced subregs. However,
-    // some machine-specific code still can't handle it.
-    // To handle it properly we also need a way find a constrained subregister
-    // class given a super-reg class and subreg index.
-    if (DefMI->getOperand(1).getSubReg())
+    unsigned SrcSubReg = DefMI->getOperand(1).getSubReg();
+    if (SrcSubReg) {
+      const TargetRegisterClass *UseRC = MRI->getRegClassOrNull(Reg);
+      const TargetRegisterClass *SrcRC = MRI->getRegClassOrNull(SrcReg);
+      if (!UseRC || !SrcRC)
+        continue;
+
+      const TargetRegisterClass *NewSuperRC =
+          TRI->getMatchingSuperRegClass(SrcRC, UseRC, SrcSubReg);
+      if (!NewSuperRC)
+        continue;
+
+      if (!MRI->constrainRegClass(SrcReg, NewSuperRC))
+        continue;
+
+      // Note: We don't call constrainRegAttrs(SrcReg, Reg) here because
+      // we're replacing uses of Reg with SrcReg:SrcSubReg, not merging
+      // their constraints. The substVirtReg call will handle the substitution.
+
+      LLVM_DEBUG(dbgs() << "Coalescing (subreg): " << *DefMI);
+      LLVM_DEBUG(dbgs() << "***              to: " << *MI);
+
+      // Propagate SrcReg:SrcSubReg of copies to MI.
+      MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
+      MRI->clearKillFlags(SrcReg);
+
+      if (OnlyOneUse) {
+        SmallVector<MachineOperand *, 4> DbgUses;
+        for (auto &U : MRI->use_operands(Reg)) {
+          MachineInstr *UDI = U.getParent();
+          if (UDI->isDebugValue())
+            DbgUses.push_back(&U);
+        }
+        for (MachineOperand *U : DbgUses)
+          U->substVirtReg(SrcReg, SrcSubReg, *TRI);
+
+        DefMI->eraseFromParent();
+        ++NumCoalesces;
+      }
+      Changed = true;
       continue;
+    }
     if (!MRI->constrainRegAttrs(SrcReg, Reg))
       continue;
     LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 74a717f1635a3..8f5974881c489 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1247,12 +1247,12 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ubfx x12, x9, #48, #4
 ; CHECK-NEXT:    lsr x10, x9, #52
-; CHECK-NEXT:    ubfx x13, x9, #32, #4
+; CHECK-NEXT:    ubfx x14, x9, #32, #4
 ; CHECK-NEXT:    ubfx w15, w9, #16, #4
 ; CHECK-NEXT:    lsr x11, x9, #36
-; CHECK-NEXT:    lsr w14, w9, #20
+; CHECK-NEXT:    lsr w13, w9, #20
 ; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    fmov s2, w13
+; CHECK-NEXT:    fmov s2, w14
 ; CHECK-NEXT:    lsr w12, w9, #4
 ; CHECK-NEXT:    fmov s3, w15
 ; CHECK-NEXT:    mov.h v1[1], w10
@@ -1260,7 +1260,7 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    mov.h v2[1], w11
 ; CHECK-NEXT:    fmov s4, w10
 ; CHECK-NEXT:    lsr x11, x9, #56
-; CHECK-NEXT:    mov.h v3[1], w14
+; CHECK-NEXT:    mov.h v3[1], w13
 ; CHECK-NEXT:    lsr x10, x9, #40
 ; CHECK-NEXT:    mov.h v4[1], w12
 ; CHECK-NEXT:    lsr w12, w9, #24
@@ -1301,14 +1301,14 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ubfx w11, w9, #12, #4
-; CHECK-BE-NEXT:    lsr w14, w9, #28
+; CHECK-BE-NEXT:    lsr w13, w9, #28
 ; CHECK-BE-NEXT:    lsr w10, w9, #8
 ; CHECK-BE-NEXT:    ubfx x15, x9, #44, #4
 ; CHECK-BE-NEXT:    lsr w12, w9, #24
-; CHECK-BE-NEXT:    lsr x13, x9, #40
+; CHECK-BE-NEXT:    lsr x14, x9, #40
 ; CHECK-BE-NEXT:    fmov s1, w11
 ; CHECK-BE-NEXT:    lsr x11, x9, #60
-; CHECK-BE-NEXT:    fmov s2, w14
+; CHECK-BE-NEXT:    fmov s2, w13
 ; CHECK-BE-NEXT:    fmov s3, w15
 ; CHECK-BE-NEXT:    fmov s4, w11
 ; CHECK-BE-NEXT:    lsr w11, w9, #20
@@ -1316,7 +1316,7 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    lsr x10, x9, #56
 ; CHECK-BE-NEXT:    mov v2.h[1], w12
 ; CHECK-BE-NEXT:    lsr w12, w9, #4
-; CHECK-BE-NEXT:    mov v3.h[1], w13
+; CHECK-BE-NEXT:    mov v3.h[1], w14
 ; CHECK-BE-NEXT:    mov v4.h[1], w10
 ; CHECK-BE-NEXT:    lsr x10, x9, #36
 ; CHECK-BE-NEXT:    mov v1.h[2], w12
diff --git a/llvm/test/CodeGen/X86/cse-add-with-overflow.ll b/llvm/test/CodeGen/X86/cse-add-with-overflow.ll
index 40214267e1743..ff71472a65f7d 100644
--- a/llvm/test/CodeGen/X86/cse-add-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/cse-add-with-overflow.ll
@@ -1,12 +1,11 @@
 ; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=generic | FileCheck %s
-; XFAIL: *
 ; rdar:15661073 simple example of redundant adds
 ;
 ; MachineCSE should coalesce trivial subregister copies.
 ;
 ; The extra movl+addl should be removed during MachineCSE.
 ; CHECK-LABEL: redundantadd
-; CHECK: cmpq
+; CHECK: cmpl
 ; CHECK: movq
 ; CHECK-NOT: movl
 ; CHECK: addl