[llvm] 20711c1 - [AArch64][SME] Allow SME peephole optimizations across SME pseudos (#157655)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 24 04:14:57 PDT 2025


Author: Benjamin Maxwell
Date: 2025-09-24T12:14:53+01:00
New Revision: 20711c1ecb7a35fa0fc0bf99461da3afe682e355

URL: https://github.com/llvm/llvm-project/commit/20711c1ecb7a35fa0fc0bf99461da3afe682e355
DIFF: https://github.com/llvm/llvm-project/commit/20711c1ecb7a35fa0fc0bf99461da3afe682e355.diff

LOG: [AArch64][SME] Allow SME peephole optimizations across SME pseudos (#157655)

This allows folding `smstart/stops` in more cases.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
    llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
    llvm/test/CodeGen/AArch64/sme-peephole-opts.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 85cca1de47b78..2a563663a34d1 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -184,6 +184,17 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
           isSVERegOp(TRI, MRI, MI.getOperand(1)))
         Prev = nullptr;
       break;
+    case AArch64::RestoreZAPseudo:
+    case AArch64::InOutZAUsePseudo:
+    case AArch64::CommitZASavePseudo:
+    case AArch64::SMEStateAllocPseudo:
+    case AArch64::RequiresZASavePseudo:
+      // These instructions only depend on the ZA state, not the streaming mode,
+      // so if the pair of smstart/stop is only changing the streaming mode, we
+      // can permit these instructions.
+      if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM)
+        Prev = nullptr;
+      break;
     case AArch64::ADJCALLSTACKDOWN:
     case AArch64::ADJCALLSTACKUP:
     case AArch64::ANDXri:

diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index a0a14f2ffae3f..e3007a3723484 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEWLOWERING-NEXT:    smstop sm
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
-; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:    smstop sm
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
 ; CHECK-NEWLOWERING-NEXT:    mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEWLOWERING-NEXT:  .LBB5_2:
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_4
 ; CHECK-NEWLOWERING-NEXT:  // %bb.3:
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
 ; CHECK-NEWLOWERING-NEXT:  .LBB5_4:
-; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_6
-; CHECK-NEWLOWERING-NEXT:  // %bb.5:
-; CHECK-NEWLOWERING-NEXT:    smstop sm
-; CHECK-NEWLOWERING-NEXT:  .LBB5_6:
-; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
-; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_8
-; CHECK-NEWLOWERING-NEXT:  // %bb.7:
-; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:  .LBB5_8:
 ; CHECK-NEWLOWERING-NEXT:    mov x8, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore

diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index cab094e638cdf..ced0d41c22dab 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
 
 declare void @callee()
 declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -563,3 +563,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
   store <vscale x 4 x float> %res1, ptr %ptr
   ret void
 }
+
+; normal caller -> streaming callees (with ZA state)
+define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test14:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    sub x10, x29, #80
+; CHECK-NEXT:    stp x9, x8, [x29, #-80]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    bl callee_sm
+; CHECK-NEXT:    bl callee_sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #80
+; CHECK-NEXT:    cbnz x8, .LBB15_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB15_2:
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee_sm()
+  call void @callee_sm()
+  ret void
+}
+
+; normal caller -> streaming callees (with ZA agnostic state)
+define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
+; CHECK-LABEL: test15:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state_size
+; CHECK-NEXT:    sub sp, sp, x0
+; CHECK-NEXT:    mov x20, sp
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    bl __arm_sme_save
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    bl callee_sm
+; CHECK-NEXT:    bl callee_sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    bl __arm_sme_restore
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee_sm()
+  call void @callee_sm()
+  ret void
+}
+
+; locally streaming caller -> normal callees (with ZA state)
+define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
+; CHECK-LABEL: test16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stp x9, x8, [x29, #-80]
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    cbz x8, .LBB17_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    bl __arm_tpidr2_save
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    zero {za}
+; CHECK-NEXT:  .LBB17_2:
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    sub x8, x29, #80
+; CHECK-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee()
+  call void @callee()
+  ret void
+}


        


More information about the llvm-commits mailing list