[llvm] [AArch64][SME] Avoid clobbering X0 in the MachineSMEABIPass (PR #170131)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 1 05:14:05 PST 2025
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/170131
This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0, which should be possible in most cases (since X0's live ranges are likely to be very short before register allocation).
This improves codegen in a few cases, as not all redundant copies to/from X0 are eliminated.
From f42b0a27a13bf1d389aaa4e5e7ac73fde45e2dca Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 28 Nov 2025 13:24:20 +0000
Subject: [PATCH] [AArch64][SME] Avoid clobbering X0 in the MachineSMEABIPass
This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0,
which should be possible in most cases (since X0's live ranges are
likely to be very short before register allocation).
This improves codegen in a few cases, as not all redundant copies
to/from X0 are eliminated.
Change-Id: I38d8f3b40f6b1c2143ec9efb95637d244ad264e3
---
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 16 +++++++++++-----
.../AArch64/machine-sme-abi-find-insert-pt.mir | 4 +---
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 16 ++++++++--------
llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll | 6 ++----
llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll | 8 ++++----
.../CodeGen/AArch64/sve-stack-frame-layout.ll | 14 ++++++--------
6 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b96f6f12a58d6..e1c84ab76e1ba 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -632,8 +632,8 @@ MachineSMEABI::findStateChangeInsertionPoint(
PhysLiveRegs = Block.PhysLiveRegsAtExit;
}
- if (!(PhysLiveRegs & LiveRegs::NZCV))
- return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+ if (PhysLiveRegs == LiveRegs::None)
+ return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).
// Find the previous state change. We can not move before this point.
MachineBasicBlock::iterator PrevStateChangeI;
@@ -650,15 +650,21 @@ MachineSMEABI::findStateChangeInsertionPoint(
// Note: LiveUnits will only accurately track X0 and NZCV.
LiveRegUnits LiveUnits(*TRI);
setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+ auto BestCandidate = std::make_pair(InsertPt, PhysLiveRegs);
for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
// Don't move before/into a call (which may have a state change before it).
if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
break;
LiveUnits.stepBackward(*I);
- if (LiveUnits.available(AArch64::NZCV))
- return {I, getPhysLiveRegs(LiveUnits)};
+ LiveRegs CurrentPhysLiveRegs = getPhysLiveRegs(LiveUnits);
+ // Find places where NZCV is available, but keep looking for locations where
+ // both NZCV and X0 are available, which can avoid some copies.
+ if (!(CurrentPhysLiveRegs & LiveRegs::NZCV))
+ BestCandidate = {I, getPhysLiveRegs(LiveUnits)};
+ if (CurrentPhysLiveRegs == LiveRegs::None)
+ break;
}
- return {InsertPt, PhysLiveRegs};
+ return BestCandidate;
}
void MachineSMEABI::insertStateChanges(EmitContext &Context,
diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
index 3f174a62128a8..ed768dec77998 100644
--- a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
+++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
@@ -79,14 +79,12 @@ body: |
; CHECK-NEXT: RequiresZASavePseudo
; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
; CHECK-NEXT: MSR 56965, $xzr
- ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+ ; CHECK-NEXT: $x0 = IMPLICIT_DEF
; CHECK-NEXT: $nzcv = IMPLICIT_DEF
; CHECK-NEXT: FAKE_USE $x0
; CHECK-NEXT: $zab0 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 30dbd1cb34667..0906e10b551b7 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -67,10 +67,10 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -170,11 +170,11 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x20
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -267,14 +267,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -336,10 +336,10 @@ define i64 @test_many_callee_arguments(
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
index 0c886c643c5fb..87a63fed0546c 100644
--- a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
+++ b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
@@ -87,8 +87,7 @@ define i32 @load_tls_shared_za() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .tlsdesccall x
; CHECK-NEXT: blr x1
; CHECK-NEXT: mrs x8, TPIDR_EL0
-; CHECK-NEXT: ldr w0, [x8, x0]
-; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: ldr w8, [x8, x0]
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x9, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
@@ -133,8 +132,7 @@ define i32 @load_tls_streaming_shared_za() nounwind "aarch64_inout_za" "aarch64_
; CHECK-NEXT: blr x1
; CHECK-NEXT: smstart sm
; CHECK-NEXT: mrs x8, TPIDR_EL0
-; CHECK-NEXT: ldr w0, [x8, x0]
-; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: ldr w8, [x8, x0]
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x9, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 50dd0c699284c..e672f777703a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -621,15 +621,15 @@ define i64 @test_many_callee_arguments(
; CHECK-NEWLOWERING-NEXT: stp x10, x11, [sp, #-16]!
; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x1, x0
; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB9_2
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_2
; CHECK-NEWLOWERING-NEXT: // %bb.1:
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB9_2:
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov x0, x1
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 3aaae5e73ff23..37adfb89e4762 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -33,7 +33,7 @@ define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible"
; CHECK-COMMON-NEXT: ldr x29, [sp, #8] // 8-byte Reload
; CHECK-COMMON-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ret
-; CHECK-COMMON-NE
+; CHECK-NE
entry:
%a = alloca <vscale x 4 x i32>
%b = alloca i32
@@ -626,23 +626,21 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
; CHECK-NEWLOWERING-NEXT: mov w20, w0
+; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl other
; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: mov w0, w20
-; CHECK-NEWLOWERING-NEXT: mov w8, w0
; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB8_2
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %entry
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %entry
-; CHECK-NEWLOWERING-NEXT: mov w0, w8
+; CHECK-NEWLOWERING-NEXT: mov w0, w20
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa wsp, 112
@@ -671,4 +669,4 @@ entry:
tail call void @other()
ret i32 %x
}
-declare void @other()
\ No newline at end of file
+declare void @other()
More information about the llvm-commits
mailing list