[llvm] [AArch64][SME] Avoid clobbering X0 in the MachineSMEABIPass (PR #170131)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 1 05:14:05 PST 2025


https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/170131

This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0, which should be possible in most cases (since X0's live ranges are likely to be very short before register allocation).

This improves codegen in a few cases, since not all redundant copies to/from X0 would otherwise be eliminated by later passes.


>From f42b0a27a13bf1d389aaa4e5e7ac73fde45e2dca Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 28 Nov 2025 13:24:20 +0000
Subject: [PATCH] [AArch64][SME] Avoid clobbering X0 in the MachineSMEABIPass

This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0,
which should be possible in most cases (since X0's live ranges are
likely to be very short before register allocation).

This improves codegen in a few cases, since not all redundant copies
to/from X0 would otherwise be eliminated by later passes.

Change-Id: I38d8f3b40f6b1c2143ec9efb95637d244ad264e3
---
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp    | 16 +++++++++++-----
 .../AArch64/machine-sme-abi-find-insert-pt.mir   |  4 +---
 llvm/test/CodeGen/AArch64/sme-agnostic-za.ll     | 16 ++++++++--------
 llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll     |  6 ++----
 llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll  |  8 ++++----
 .../CodeGen/AArch64/sve-stack-frame-layout.ll    | 14 ++++++--------
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b96f6f12a58d6..e1c84ab76e1ba 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -632,8 +632,8 @@ MachineSMEABI::findStateChangeInsertionPoint(
     PhysLiveRegs = Block.PhysLiveRegsAtExit;
   }
 
-  if (!(PhysLiveRegs & LiveRegs::NZCV))
-    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+  if (PhysLiveRegs == LiveRegs::None)
+    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).
 
   // Find the previous state change. We can not move before this point.
   MachineBasicBlock::iterator PrevStateChangeI;
@@ -650,15 +650,21 @@ MachineSMEABI::findStateChangeInsertionPoint(
   // Note: LiveUnits will only accurately track X0 and NZCV.
   LiveRegUnits LiveUnits(*TRI);
   setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+  auto BestCandidate = std::make_pair(InsertPt, PhysLiveRegs);
   for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
     // Don't move before/into a call (which may have a state change before it).
     if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
       break;
     LiveUnits.stepBackward(*I);
-    if (LiveUnits.available(AArch64::NZCV))
-      return {I, getPhysLiveRegs(LiveUnits)};
+    LiveRegs CurrentPhysLiveRegs = getPhysLiveRegs(LiveUnits);
+    // Find places where NZCV is available, but keep looking for locations where
+    // both NZCV and X0 are available, which can avoid some copies.
+    if (!(CurrentPhysLiveRegs & LiveRegs::NZCV))
+      BestCandidate = {I, getPhysLiveRegs(LiveUnits)};
+    if (CurrentPhysLiveRegs == LiveRegs::None)
+      break;
   }
-  return {InsertPt, PhysLiveRegs};
+  return BestCandidate;
 }
 
 void MachineSMEABI::insertStateChanges(EmitContext &Context,
diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
index 3f174a62128a8..ed768dec77998 100644
--- a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
+++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
@@ -79,14 +79,12 @@ body:             |
     ; CHECK-NEXT: RequiresZASavePseudo
     ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
     ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
     ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
     ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
     ; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
     ; CHECK-NEXT: MSR 56965, $xzr
-    ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
     ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
     ; CHECK-NEXT: FAKE_USE $x0
     ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 30dbd1cb34667..0906e10b551b7 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -67,10 +67,10 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -170,11 +170,11 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x20
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -267,14 +267,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_4
 ; CHECK-NEWLOWERING-NEXT:  // %bb.3:
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
 ; CHECK-NEWLOWERING-NEXT:  .LBB5_4:
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -336,10 +336,10 @@ define i64  @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
index 0c886c643c5fb..87a63fed0546c 100644
--- a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
+++ b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
@@ -87,8 +87,7 @@ define i32 @load_tls_shared_za() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    .tlsdesccall x
 ; CHECK-NEXT:    blr x1
 ; CHECK-NEXT:    mrs x8, TPIDR_EL0
-; CHECK-NEXT:    ldr w0, [x8, x0]
-; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    ldr w8, [x8, x0]
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x9, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
@@ -133,8 +132,7 @@ define i32 @load_tls_streaming_shared_za() nounwind "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    blr x1
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    mrs x8, TPIDR_EL0
-; CHECK-NEXT:    ldr w0, [x8, x0]
-; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    ldr w8, [x8, x0]
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x9, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 50dd0c699284c..e672f777703a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -621,15 +621,15 @@ define i64  @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT:    stp x10, x11, [sp, #-16]!
 ; CHECK-NEWLOWERING-NEXT:    bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x9, .LBB9_2
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB9_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1:
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT:  .LBB9_2:
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 3aaae5e73ff23..37adfb89e4762 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -33,7 +33,7 @@ define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible"
 ; CHECK-COMMON-NEXT:    ldr x29, [sp, #8] // 8-byte Reload
 ; CHECK-COMMON-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
-; CHECK-COMMON-NE
+; CHECK-NE
 entry:
   %a = alloca <vscale x 4 x i32>
   %b = alloca i32
@@ -626,23 +626,21 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT:    mov w20, w0
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-80]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:    smstop sm
 ; CHECK-NEWLOWERING-NEXT:    bl other
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:    mov w0, w20
-; CHECK-NEWLOWERING-NEXT:    mov w8, w0
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT:    cbnz x9, .LBB8_2
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB8_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %entry
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT:  .LBB8_2: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov w0, w8
+; CHECK-NEWLOWERING-NEXT:    mov w0, w20
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa wsp, 112
@@ -671,4 +669,4 @@ entry:
   tail call void @other()
   ret i32 %x
 }
-declare void @other()
\ No newline at end of file
+declare void @other()



More information about the llvm-commits mailing list