[llvm] 20711c1 - [AArch64][SME] Allow SME peephole optimizations across SME pseudos (#157655)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 04:14:57 PDT 2025
Author: Benjamin Maxwell
Date: 2025-09-24T12:14:53+01:00
New Revision: 20711c1ecb7a35fa0fc0bf99461da3afe682e355
URL: https://github.com/llvm/llvm-project/commit/20711c1ecb7a35fa0fc0bf99461da3afe682e355
DIFF: https://github.com/llvm/llvm-project/commit/20711c1ecb7a35fa0fc0bf99461da3afe682e355.diff
LOG: [AArch64][SME] Allow SME peephole optimizations across SME pseudos (#157655)
This allows folding `smstart`/`smstop` pairs in more cases.
Added:
Modified:
llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 85cca1de47b78..2a563663a34d1 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -184,6 +184,17 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
isSVERegOp(TRI, MRI, MI.getOperand(1)))
Prev = nullptr;
break;
+ case AArch64::RestoreZAPseudo:
+ case AArch64::InOutZAUsePseudo:
+ case AArch64::CommitZASavePseudo:
+ case AArch64::SMEStateAllocPseudo:
+ case AArch64::RequiresZASavePseudo:
+ // These instructions only depend on the ZA state, not the streaming mode,
+ // so if the pair of smstart/stop is only changing the streaming mode, we
+ // can permit these instructions.
+ if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM)
+ Prev = nullptr;
+ break;
case AArch64::ADJCALLSTACKDOWN:
case AArch64::ADJCALLSTACKUP:
case AArch64::ANDXri:
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index a0a14f2ffae3f..e3007a3723484 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5:
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_6:
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
-; CHECK-NEWLOWERING-NEXT: // %bb.7:
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_8:
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index cab094e638cdf..ced0d41c22dab 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
declare void @callee()
declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -563,3 +563,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
store <vscale x 4 x float> %res1, ptr %ptr
ret void
}
+
+; normal caller -> streaming callees (with ZA state)
+define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test14:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #80
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #80
+; CHECK-NEXT: cbnz x8, .LBB15_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; normal caller -> streaming callees (with ZA agnostic state)
+define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
+; CHECK-LABEL: test15:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: sub sp, sp, x0
+; CHECK-NEXT: mov x20, sp
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; locally streaming caller -> normal callees (with ZA state)
+define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
+; CHECK-LABEL: test16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB17_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee()
+ call void @callee()
+ ret void
+}
More information about the llvm-commits
mailing list