[llvm] Greedy: Make trySplitAroundHintReg try to match hints with subreg copies (PR #160294)

Matt Arsenault via llvm-commits llvm-commits@lists.llvm.org
Tue Sep 23 06:09:36 PDT 2025


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/160294

This is essentially the same patch as 116ca9522e89f1e4e02676b5bbe505e80c4d4933;
when trying to match a physreg hint, try to find a compatible physreg if there is
a subregister copy. This has the slight difference of using getSubReg on the hint
instead of getMatchingSuperReg (the other use should also use getSubReg instead,
it's faster).

At the moment this turns out to have very little effect. The adjacent code needs
better handling of subregisters, so continue adding this piecemeal. The X86 test
shows a net reduction in real instructions, plus a few new kills.

From 35f1e158c17351fcd02abfb0b50e32fc2c630c28 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 18 Sep 2025 23:39:05 +0900
Subject: [PATCH] Greedy: Make trySplitAroundHintReg try to match hints with
 subreg copies

This is essentially the same patch as 116ca9522e89f1e4e02676b5bbe505e80c4d4933;
when trying to match a physreg hint, try to find a compatible physreg if there is
a subregister copy. This has the slight difference of using getSubReg on the hint
instead of getMatchingSuperReg (the other use should also use getSubReg instead,
it's faster).

At the moment this turns out to have very little effect. The adjacent code needs
better handling of subregisters, so continue adding this piecemeal. The X86 test
shows a net reduction in real instructions, plus a few new kills.
---
 llvm/lib/CodeGen/RegAllocGreedy.cpp      | 30 +++++++++++++-----
 llvm/test/CodeGen/X86/atomic-bit-test.ll | 40 +++++++++++++-----------
 2 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index d004815d2c17a..76ed9dad3456d 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1383,21 +1383,35 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
   // Compute the cost of assigning a non Hint physical register to VirtReg.
   // We define it as the total frequency of broken COPY instructions to/from
   // Hint register, and after split, they can be deleted.
-  for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
-    if (!TII->isFullCopyInstr(Instr))
+
+  // FIXME: This is miscounting the costs with subregisters. In particular, this
+  // should support recognizing SplitKit formed copy bundles instead of direct
+  // copy instructions.
+  for (const MachineOperand &Opnd : MRI->reg_nodbg_operands(Reg)) {
+    const MachineInstr &Instr = *Opnd.getParent();
+    if (!Instr.isCopy() || Opnd.isImplicit())
       continue;
-    Register OtherReg = Instr.getOperand(1).getReg();
-    if (OtherReg == Reg) {
-      OtherReg = Instr.getOperand(0).getReg();
-      if (OtherReg == Reg)
-        continue;
+
+    // Look for the other end of the copy.
+    const bool IsDef = Opnd.isDef();
+    const MachineOperand &OtherOpnd = Instr.getOperand(IsDef);
+    Register OtherReg = OtherOpnd.getReg();
+    assert(Reg == Opnd.getReg());
+    if (OtherReg == Reg)
+      continue;
+
+    unsigned SubReg = Opnd.getSubReg();
+    if (!IsDef) {
       // Check if VirtReg interferes with OtherReg after this COPY instruction.
       if (VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot()))
         continue;
     }
+
     MCRegister OtherPhysReg =
         OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
-    if (OtherPhysReg == Hint)
+    MCRegister ThisHint =
+        SubReg ? TRI->getSubReg(Hint, SubReg) : MCRegister(Hint);
+    if (OtherPhysReg == ThisHint)
       Cost += MBFI->getBlockFreq(Instr.getParent());
   }
 
diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index 8f91f4120842b..b06bef44a5e9e 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -469,52 +469,56 @@ entry:
 define i16 @use_in_diff_bb() nounwind {
 ; X86-LABEL: use_in_diff_bb:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzwl v16, %esi
+; X86-NEXT:    movzwl v16, %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB17_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    orl $1, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB17_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    jne .LBB17_4
 ; X86-NEXT:  # %bb.3:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl %eax, %esi
; X86-NEXT:    calll foo@PLT
-; X86-NEXT:  .LBB17_4:
-; X86-NEXT:    andl $1, %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:  .LBB17_4:
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: use_in_diff_bb:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movzwl v16(%rip), %ebx
+; X64-NEXT:    movzwl v16(%rip), %eax
 ; X64-NEXT:    .p2align 4
 ; X64-NEXT:  .LBB17_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %ebx, %ecx
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    orl $1, %ecx
-; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    movl %eax, %ebx
+; X64-NEXT:    # kill: def $ax killed $ax def $eax
 ; X64-NEXT:    jne .LBB17_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testb %al, %al
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testb %cl, %cl
 ; X64-NEXT:    jne .LBB17_4
 ; X64-NEXT:  # %bb.3:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movl %eax, %ebx
; X64-NEXT:    callq foo@PLT
-; X64-NEXT:  .LBB17_4:
-; X64-NEXT:    andl $1, %ebx
 ; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
+; X64-NEXT:  .LBB17_4:
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw or ptr @v16, i16 1 monotonic, align 2



More information about the llvm-commits mailing list