[llvm] [AArch64][SME] Allow SME peephole optimizations across SME pseudos (PR #157655)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 05:03:50 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes:
This allows folding `smstart`/`smstop` pairs in more cases, by letting the peephole look across the new SME state pseudos.
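
For illustration, a minimal sketch of the pattern this unlocks, distilled from the new `test14` in the diff below (the function name `@za_caller` is illustrative): between two consecutive streaming calls from a function with ZA state, the lowering emits ZA bookkeeping pseudos (`RequiresZASavePseudo` and friends), which previously stopped the peephole from folding the intervening `smstop sm`/`smstart sm` pair.

```llvm
declare void @callee_sm() "aarch64_pstate_sm_enabled"

; A non-streaming caller with ZA state calling two streaming callees.
; Before this patch, the ZA pseudos emitted between the calls made the
; peephole give up, leaving an smstop sm/smstart sm pair between them;
; with this patch the redundant pair is folded away.
define void @za_caller() nounwind "aarch64_inout_za" {
  call void @callee_sm()
  call void @callee_sm()
  ret void
}
```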
---
Full diff: https://github.com/llvm/llvm-project/pull/157655.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp (+5)
- (modified) llvm/test/CodeGen/AArch64/sme-agnostic-za.ll (+1-11)
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+126-1)
``````````diff
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 85cca1de47b78..ec70ddfb5fcf1 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -184,6 +184,11 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
isSVERegOp(TRI, MRI, MI.getOperand(1)))
Prev = nullptr;
break;
+ case AArch64::RestoreZAPseudo:
+ case AArch64::InOutZAUsePseudo:
+ case AArch64::CommitZASavePseudo:
+ case AArch64::SMEStateAllocPseudo:
+ case AArch64::RequiresZASavePseudo:
case AArch64::ADJCALLSTACKDOWN:
case AArch64::ADJCALLSTACKUP:
case AArch64::ANDXri:
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index a0a14f2ffae3f..e3007a3723484 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5:
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_6:
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
-; CHECK-NEWLOWERING-NEXT: // %bb.7:
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_8:
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 80827c2547780..442636cfc8398 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
declare void @callee()
declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -554,3 +554,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
store <vscale x 4 x float> %res1, ptr %ptr
ret void
}
+
+; normal caller -> streaming callees (with ZA state)
+define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test14:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #80
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #80
+; CHECK-NEXT: cbnz x8, .LBB15_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; normal caller -> streaming callees (with ZA agnostic state)
+define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
+; CHECK-LABEL: test15:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: sub sp, sp, x0
+; CHECK-NEXT: mov x20, sp
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: bl callee_sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_sm()
+ call void @callee_sm()
+ ret void
+}
+
+; locally streaming caller -> normal callees (with ZA state)
+define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
+; CHECK-LABEL: test16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB17_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee()
+ call void @callee()
+ ret void
+}
``````````
https://github.com/llvm/llvm-project/pull/157655