[clang] [llvm] [AArch64] Enable the new SME ABI lowering (`-aarch64-new-sme-abi`) by default (PR #172642)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 19 00:09:19 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/172642
>From c6bc9381e74de9564a3bd548a705b15e63db1d33 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 17 Dec 2025 10:57:52 +0000
Subject: [PATCH] [AArch64] Enable the new SME ABI lowering
(`-aarch64-new-sme-abi`) by default
The previous SelectionDAG lowering is still available via
`-aarch64-new-sme-abi=false` (this will stay around until at least
LLVM 23).
In tests that contained `CHECK-NEWLOWERING` the checks have been updated
so:
* `CHECK-NEWLOWERING` -> `CHECK` (the new default)
* `CHECK` -> `CHECK-SDAG` (the old SelectionDAG lowering)
But otherwise, the check lines have not changed.
Tests that were not explicitly checking the SME lowering have been
updated to match the new default lowering.
Those tests are:
* llvm/test/CodeGen/AArch64/O0-pipeline.ll
* llvm/test/CodeGen/AArch64/O3-pipeline.ll
* llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
* llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
* llvm/test/CodeGen/AArch64/stack-hazard.ll
---
clang/test/CodeGen/AArch64/sme-remarks.c | 28 +-
.../Target/AArch64/AArch64TargetMachine.cpp | 2 +-
llvm/test/CodeGen/AArch64/O0-pipeline.ll | 5 +-
llvm/test/CodeGen/AArch64/O3-pipeline.ll | 7 +-
.../AArch64/sme-abi-save-call-remarks.ll | 68 +-
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 366 +++----
.../AArch64/sme-disable-gisel-fisel.ll | 41 +-
.../CodeGen/AArch64/sme-framelower-use-bp.ll | 908 ++++++------------
.../CodeGen/AArch64/sme-lazy-save-call.ll | 564 +++++------
.../CodeGen/AArch64/sme-new-za-function.ll | 224 ++---
.../CodeGen/AArch64/sme-za-control-flow.ll | 660 ++++++-------
.../test/CodeGen/AArch64/sme-za-exceptions.ll | 4 +-
.../AArch64/sme-za-lazy-save-buffer.ll | 106 +-
llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 336 +++----
llvm/test/CodeGen/AArch64/stack-hazard.ll | 10 +-
.../CodeGen/AArch64/sve-stack-frame-layout.ll | 146 +--
16 files changed, 1581 insertions(+), 1894 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/sme-remarks.c b/clang/test/CodeGen/AArch64/sme-remarks.c
index fd144b8a6c425..f7a1f33f3372d 100644
--- a/clang/test/CodeGen/AArch64/sme-remarks.c
+++ b/clang/test/CodeGen/AArch64/sme-remarks.c
@@ -1,39 +1,39 @@
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -Rpass-analysis=sme -verify %s -S -o /dev/null
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -mllvm -aarch64-new-sme-abi -Rpass-analysis=sme -verify=expected-new %s -S -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -mllvm -aarch64-new-sme-abi=false -Rpass-analysis=sme -verify=expected-sdag %s -S -o /dev/null
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -Rpass-analysis=sme -verify %s -S -o /dev/null %s
void private_za_callee_a();
void private_za_callee_b();
void private_za_callee_c();
void test_za_merge_paths(int a) __arm_inout("za") {
- // expected-new-remark at +1 {{lazy save of ZA emitted in 'test_za_merge_paths'}}
+ // expected-remark at +1 {{lazy save of ZA emitted in 'test_za_merge_paths'}}
if (a != 0)
- // expected-remark at +2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
- // expected-new-remark at +1 {{call to 'private_za_callee_a' requires ZA save}}
+ // expected-sdag-remark at +2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-remark at +1 {{call to 'private_za_callee_a' requires ZA save}}
private_za_callee_a();
else
- // expected-remark at +2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
- // expected-new-remark at +1 {{call to 'private_za_callee_b' requires ZA save}}
+ // expected-sdag-remark at +2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-remark at +1 {{call to 'private_za_callee_b' requires ZA save}}
private_za_callee_b();
- // expected-remark at +3 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-sdag-remark at +3 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}}
/// The new lowering won't report this call as the save is already needed due
/// to the call to `private_za_callee_a/b()` calls on both paths to this call.
private_za_callee_c();
}
void test_lazy_save_multiple_paths(int a) __arm_inout("za") {
- // expected-new-remark at +1 {{lazy save of ZA emitted in 'test_lazy_save_multiple_paths'}}
+ // expected-remark at +1 {{lazy save of ZA emitted in 'test_lazy_save_multiple_paths'}}
if (a != 0)
- // expected-remark at +2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
- // expected-new-remark at +1 {{call to 'private_za_callee_a' requires ZA save}}
+ // expected-sdag-remark at +2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-remark at +1 {{call to 'private_za_callee_a' requires ZA save}}
private_za_callee_a();
else {
- // expected-remark at +2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
- // expected-new-remark at +1 {{call to 'private_za_callee_b' requires ZA save}}
+ // expected-sdag-remark at +2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-remark at +1 {{call to 'private_za_callee_b' requires ZA save}}
private_za_callee_b();
- // expected-remark at +3 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
+ // expected-sdag-remark at +3 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}}
/// The new lowering won't report this call as the save is already needed
/// due to the call to `private_za_callee_b()`.
private_za_callee_c();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 346e18e553c5e..1ec5a20cc0ce0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -225,7 +225,7 @@ static cl::opt<bool>
static cl::opt<bool>
EnableNewSMEABILowering("aarch64-new-sme-abi",
cl::desc("Enable new lowering for the SME ABI"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAArch64Target() {
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index 96f5e5a4afb3e..80ff4fbb11a8f 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -33,7 +33,6 @@
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: AArch64 Stack Tagging
-; CHECK-NEXT: SME ABI Pass
; CHECK-NEXT: Exception handling preparation
; CHECK-NEXT: Prepare callbr
; CHECK-NEXT: Safe Stack instrumentation pass
@@ -56,6 +55,10 @@
; CHECK-NEXT: AArch64 Instruction Selection
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: Bundle Machine CFG Edges
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
+; CHECK-NEXT: Machine Optimization Remark Emitter
+; CHECK-NEXT: Machine SME ABI pass
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
; CHECK-NEXT: Fast Register Allocator
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index e8ea55e027aec..15266b0d6a916 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -97,8 +97,6 @@
; CHECK-NEXT: Interleaved Load Combine Pass
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Interleaved Access Pass
-; CHECK-NEXT: SME ABI Pass
-; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare
@@ -129,8 +127,11 @@
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT: SME Peephole Optimization pass
+; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
+; CHECK-NEXT: Machine Optimization Remark Emitter
+; CHECK-NEXT: Machine SME ABI pass
+; CHECK-NEXT: SME Peephole Optimization pass
; CHECK-NEXT: Early Tail Duplication
; CHECK-NEXT: Optimize machine instruction PHIs
; CHECK-NEXT: Slot index numbering
diff --git a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll
index 755dcfbf17ba4..c3c76e3e803d0 100644
--- a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sme2 --aarch64-new-sme-abi=false --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SDAG
; RUN: llc -mtriple=aarch64 -mattr=+sme2 --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s
-; RUN: llc -mtriple=aarch64 -mattr=+sme2 --aarch64-new-sme-abi --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NEWLOWERING
declare void @private_za_callee()
declare void @private_za_callee_a()
@@ -13,42 +13,42 @@ declare void @shared_za_zt0_callee() "aarch64_inout_za" "aarch64_inout_zt0"
; Note: These remarks are more useful with source debug info (which gives line numbers for `<unknown>:0:0`).
define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_1_callee'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
+; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_1_callee'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
call void @private_za_callee()
ret void
}
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_2_callees'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
+; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_2_callees'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
call void @private_za_callee()
call void @private_za_callee()
ret void
}
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_expanded_intrinsic'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'cosf' requires ZA save
+; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_expanded_intrinsic'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'cosf' requires ZA save
%res = call float @llvm.cos.f32(float %a)
ret float %res
}
define void @test_lazy_save_multiple_paths(i1 %a) "aarch64_inout_za" {
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_a' sets up a lazy save for ZA
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_b' sets up a lazy save for ZA
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_c' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_a' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_b' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_c' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_multiple_paths'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save
+; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_multiple_paths'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save
entry:
br i1 %a, label %if.end, label %if.else
@@ -67,12 +67,12 @@ if.end:
define void @test_lazy_save_with_zt0() "aarch64_inout_za" "aarch64_inout_zt0"
{
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0' to 'private_za_callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0' to 'private_za_callee' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
+; CHECK: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save
+; CHECK-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill)
call void @private_za_callee() ; Save ZA (remark ZA save)
ret void
@@ -80,13 +80,13 @@ define void @test_lazy_save_with_zt0() "aarch64_inout_za" "aarch64_inout_zt0"
define void @test_lazy_save_with_zt0_reload() "aarch64_inout_za" "aarch64_inout_zt0"
{
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0_reload' to 'private_za_callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0_reload' to 'private_za_callee' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0_reload'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
+; CHECK: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save
+; CHECK-NEXT: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload'
+; CHECK-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0_reload'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save
call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill)
call void @shared_za_zt0_callee() ; Reload ZT0
call void @private_za_callee() ; Save ZA, ZT0 (remark ZT0 spill and ZA save)
@@ -96,9 +96,9 @@ define void @test_lazy_save_with_zt0_reload() "aarch64_inout_za" "aarch64_inout_
define void @test_za_merge_paths(i1 %a) "aarch64_za_state_agnostic" {
;; Note: The old lowering does not emit any remarks for agnostic ZA saves.
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: full save of ZA emitted in 'test_za_merge_paths'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save
+; CHECK: remark: <unknown>:0:0: full save of ZA emitted in 'test_za_merge_paths'
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save
+; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save
entry:
br i1 %a, label %if.end, label %if.else
@@ -119,10 +119,10 @@ exit:
}
define void @test_lazy_save_function_ptr_callee(ptr %private_za_callee) nounwind "aarch64_inout_za" {
-; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_function_ptr_callee' to 'unknown callee' sets up a lazy save for ZA
+; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_function_ptr_callee' to 'unknown callee' sets up a lazy save for ZA
-; CHECK-NEWLOWERING: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_function_ptr_callee'
-; CHECK-NEWLOWERING-NEXT: remark: <unknown>:0:0: call requires ZA save
+; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_function_ptr_callee'
+; CHECK-NEXT: remark: <unknown>:0:0: call requires ZA save
call void %private_za_callee()
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 0906e10b551b7..344f1ef24b843 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi=false | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
target triple = "aarch64"
@@ -24,6 +24,35 @@ define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agno
; inserted for calls to non-agnostic functions and that the arg/result registers are
; preserved by the register allocator.
define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" {
+; CHECK-SDAG-LABEL: agnostic_caller_private_za_callee:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: mov x8, x0
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: mov x0, x8
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: agnostic_caller_private_za_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -37,12 +66,6 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bl private_za_decl
-; CHECK-NEXT: mov x1, x0
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_restore
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: bl private_za_decl
; CHECK-NEXT: mov x1, x0
; CHECK-NEXT: mov x0, x19
@@ -52,29 +75,6 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
-; CHECK-NEWLOWERING-NEXT: mov x19, sp
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: mov x1, x0
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x1
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -110,6 +110,47 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a
; agnostic-ZA + streaming -> private-ZA + non-streaming
define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" {
+; CHECK-SDAG-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x8, x0
+; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: add x29, sp, #64
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: mov x20, sp
+; CHECK-SDAG-NEXT: mov x0, x20
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: mov x0, x8
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: mov x0, x20
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x20
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: mov x0, x20
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: sub sp, x29, #64
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
@@ -128,14 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEXT: smstop sm
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bl private_za_decl
-; CHECK-NEXT: mov x1, x0
-; CHECK-NEXT: smstart sm
-; CHECK-NEXT: mov x0, x20
-; CHECK-NEXT: bl __arm_sme_restore
-; CHECK-NEXT: mov x0, x20
-; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: smstop sm
-; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: bl private_za_decl
; CHECK-NEXT: mov x1, x0
; CHECK-NEXT: smstart sm
@@ -150,39 +183,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
-; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
-; CHECK-NEWLOWERING-NEXT: mov x20, sp
-; CHECK-NEWLOWERING-NEXT: mov x0, x20
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: mov x1, x0
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: mov x0, x20
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x1
-; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -190,6 +190,60 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming
define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" {
+; CHECK-SDAG-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x8, x0
+; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: add x29, sp, #64
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mrs x20, SVCR
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: .LBB5_2:
+; CHECK-SDAG-NEXT: mov x0, x8
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_4
+; CHECK-SDAG-NEXT: // %bb.3:
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: .LBB5_4:
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_6
+; CHECK-SDAG-NEXT: // %bb.5:
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: .LBB5_6:
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: bl private_za_decl
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB5_8
+; CHECK-SDAG-NEXT: // %bb.7:
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: .LBB5_8:
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: sub sp, x29, #64
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
@@ -200,10 +254,10 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: mrs x20, SVCR
; CHECK-NEXT: bl __arm_sme_state_size
; CHECK-NEXT: sub sp, sp, x0
; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: mrs x20, SVCR
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: tbz w20, #0, .LBB5_2
@@ -212,6 +266,7 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bl private_za_decl
+; CHECK-NEXT: bl private_za_decl
; CHECK-NEXT: mov x1, x0
; CHECK-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEXT: // %bb.3:
@@ -219,21 +274,6 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_restore
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: tbz w20, #0, .LBB5_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: smstop sm
-; CHECK-NEXT: .LBB5_6:
-; CHECK-NEXT: mov x0, x1
-; CHECK-NEXT: bl private_za_decl
-; CHECK-NEXT: mov x1, x0
-; CHECK-NEXT: tbz w20, #0, .LBB5_8
-; CHECK-NEXT: // %bb.7:
-; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB5_8:
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_restore
; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
@@ -243,46 +283,6 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
-; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
-; CHECK-NEWLOWERING-NEXT: mov x19, sp
-; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_2:
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: bl private_za_decl
-; CHECK-NEWLOWERING-NEXT: mov x1, x0
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3:
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: .LBB5_4:
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x1
-; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -295,6 +295,31 @@ declare i64 @many_args_private_za_callee(
; stack pointer before the call -- in this test the call to __arm_sme_save
; should occur _before_ the stack decrement.
define i64 @test_many_callee_arguments(
+; CHECK-SDAG-LABEL: test_many_callee_arguments:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: mov x8, x0
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: ldp x9, x10, [x29, #32]
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: stp x9, x10, [sp, #-16]!
+; CHECK-SDAG-NEXT: mov x0, x8
+; CHECK-SDAG-NEXT: bl many_args_private_za_callee
+; CHECK-SDAG-NEXT: add sp, sp, #16
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_many_callee_arguments:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -303,8 +328,8 @@ define i64 @test_many_callee_arguments(
; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: bl __arm_sme_state_size
; CHECK-NEXT: sub sp, sp, x0
-; CHECK-NEXT: ldp x9, x10, [x29, #32]
; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: ldp x9, x10, [x29, #32]
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: stp x9, x10, [sp, #-16]!
@@ -319,31 +344,6 @@ define i64 @test_many_callee_arguments(
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
-; CHECK-NEWLOWERING-NEXT: mov x19, sp
-; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32]
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]!
-; CHECK-NEWLOWERING-NEXT: mov x0, x8
-; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x1, x0
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov x0, x1
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
) nounwind "aarch64_za_state_agnostic" {
%ret = call i64 @many_args_private_za_callee(
@@ -352,6 +352,34 @@ define i64 @test_many_callee_arguments(
}
define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
+; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: mov x8, sp
+; CHECK-SDAG-NEXT: sub x19, x8, x0
+; CHECK-SDAG-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-SDAG-NEXT: cmp sp, x19
+; CHECK-SDAG-NEXT: b.le .LBB7_3
+; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-SDAG-NEXT: str xzr, [sp]
+; CHECK-SDAG-NEXT: b .LBB7_1
+; CHECK-SDAG-NEXT: .LBB7_3:
+; CHECK-SDAG-NEXT: mov sp, x19
+; CHECK-SDAG-NEXT: ldr xzr, [sp]
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: bl private_za
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -360,6 +388,8 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEXT: bl __arm_sme_state_size
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: sub x19, x8, x0
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x19
@@ -370,8 +400,6 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mov sp, x19
; CHECK-NEXT: ldr xzr, [sp]
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: bl private_za
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_restore
@@ -379,34 +407,6 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
-; CHECK-NEWLOWERING-NEXT: mov x8, sp
-; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-NEWLOWERING-NEXT: cmp sp, x19
-; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3
-; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: b .LBB7_1
-; CHECK-NEWLOWERING-NEXT: .LBB7_3:
-; CHECK-NEWLOWERING-NEXT: mov sp, x19
-; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: bl private_za
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @private_za()
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 57025ea172097..b5974f5407c73 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -213,19 +213,18 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
-; CHECK-COMMON: // %bb.0: // %prelude
+; CHECK-COMMON: // %bb.0: // %entry
; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT: cbz x8, .LBB6_2
-; CHECK-COMMON-NEXT: b .LBB6_1
-; CHECK-COMMON-NEXT: .LBB6_1: // %save.za
+; CHECK-COMMON-NEXT: cbnz x8, .LBB6_1
+; CHECK-COMMON-NEXT: b .LBB6_2
+; CHECK-COMMON-NEXT: .LBB6_1: // %entry
; CHECK-COMMON-NEXT: bl __arm_tpidr2_save
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: zero {za}
; CHECK-COMMON-NEXT: b .LBB6_2
; CHECK-COMMON-NEXT: .LBB6_2: // %entry
; CHECK-COMMON-NEXT: smstart za
-; CHECK-COMMON-NEXT: zero {za}
; CHECK-COMMON-NEXT: bl za_shared_callee
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d1, x8
@@ -254,6 +253,9 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: sub x8, x29, #16
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
; CHECK-COMMON-NEXT: bl normal_callee
+; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-COMMON-NEXT: fmov d1, x8
+; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
; CHECK-COMMON-NEXT: sub x0, x29, #16
@@ -264,9 +266,6 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: b .LBB7_2
; CHECK-COMMON-NEXT: .LBB7_2: // %entry
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
-; CHECK-COMMON-NEXT: fmov d1, x8
-; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: mov sp, x29
; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -441,18 +440,18 @@ declare double @zt0_shared_callee(double) "aarch64_inout_zt0"
define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" {
; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee:
-; CHECK-COMMON: // %bb.0: // %prelude
+; CHECK-COMMON: // %bb.0: // %entry
; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT: cbz x8, .LBB13_2
-; CHECK-COMMON-NEXT: b .LBB13_1
-; CHECK-COMMON-NEXT: .LBB13_1: // %save.za
+; CHECK-COMMON-NEXT: cbnz x8, .LBB13_1
+; CHECK-COMMON-NEXT: b .LBB13_2
+; CHECK-COMMON-NEXT: .LBB13_1: // %entry
; CHECK-COMMON-NEXT: bl __arm_tpidr2_save
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: zero { zt0 }
; CHECK-COMMON-NEXT: b .LBB13_2
; CHECK-COMMON-NEXT: .LBB13_2: // %entry
; CHECK-COMMON-NEXT: smstart za
-; CHECK-COMMON-NEXT: zero { zt0 }
; CHECK-COMMON-NEXT: bl zt0_shared_callee
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d1, x8
@@ -470,17 +469,18 @@ define double @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline
; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee:
; CHECK-COMMON: // %bb.0: // %entry
; CHECK-COMMON-NEXT: sub sp, sp, #80
-; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: mov x19, sp
-; CHECK-COMMON-NEXT: str zt0, [x19]
+; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Spill
+; CHECK-COMMON-NEXT: mov x8, sp
+; CHECK-COMMON-NEXT: str zt0, [x8]
; CHECK-COMMON-NEXT: smstop za
; CHECK-COMMON-NEXT: bl normal_callee
-; CHECK-COMMON-NEXT: smstart za
-; CHECK-COMMON-NEXT: ldr zt0, [x19]
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d1, x8
; CHECK-COMMON-NEXT: fadd d0, d0, d1
-; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: mov x8, sp
+; CHECK-COMMON-NEXT: ldr zt0, [x8]
+; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Reload
; CHECK-COMMON-NEXT: add sp, sp, #80
; CHECK-COMMON-NEXT: ret
entry:
@@ -511,4 +511,3 @@ define void @agnostic_za_function(ptr %ptr) nounwind "aarch64_za_state_agnostic"
call void %ptr()
ret void
}
-
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 99c65b090adb0..28050960c1da4 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -10,506 +10,168 @@ declare void @llvm.trap() #0
define void @quux() #1 {
; CHECK-LABEL: quux:
-; CHECK: // %bb.0: // %prelude
-; CHECK-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #384
-; CHECK-NEXT: .cfi_def_cfa w29, 96
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -40
-; CHECK-NEXT: .cfi_offset w24, -48
-; CHECK-NEXT: .cfi_offset w25, -56
-; CHECK-NEXT: .cfi_offset w26, -64
-; CHECK-NEXT: .cfi_offset w27, -72
-; CHECK-NEXT: .cfi_offset w28, -80
-; CHECK-NEXT: .cfi_offset w30, -88
-; CHECK-NEXT: .cfi_offset w29, -96
-; CHECK-NEXT: rdsvl x8, #1
+; CHECK: // %bb.0: // %bb
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #352
+; CHECK-NEXT: addvl sp, sp, #-21
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x02, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x01, 0x1e, 0x22 // sp + 368 + 168 * VG
+; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: cbz x8, .LBB0_2
-; CHECK-NEXT: b .LBB0_1
-; CHECK-NEXT: .LBB0_1: // %save.za
+; CHECK-NEXT: cbnz x8, .LBB0_1
+; CHECK-NEXT: b .LBB0_2
+; CHECK-NEXT: .LBB0_1: // %bb
; CHECK-NEXT: bl __arm_tpidr2_save
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_2: // %bb
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: mov w9, #15 // =0xf
-; CHECK-NEXT: // implicit-def: $x8
-; CHECK-NEXT: mov w8, w9
-; CHECK-NEXT: mov x9, x8
-; CHECK-NEXT: incd x9
-; CHECK-NEXT: mov w0, w9
-; CHECK-NEXT: // implicit-def: $x9
-; CHECK-NEXT: mov w9, w0
-; CHECK-NEXT: and x14, x9, #0x70
-; CHECK-NEXT: sub x9, x29, #120
-; CHECK-NEXT: stur x14, [x9, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #112
-; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #104
-; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, x8
-; CHECK-NEXT: incb x9
-; CHECK-NEXT: mov w0, w9
-; CHECK-NEXT: // implicit-def: $x9
-; CHECK-NEXT: mov w9, w0
-; CHECK-NEXT: and x10, x9, #0x3f0
-; CHECK-NEXT: sub x9, x29, #96
-; CHECK-NEXT: stur x10, [x9, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #88
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #80
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #72
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #64
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #56
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #48
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #40
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #32
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #24
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #16
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x11, x29, #8
-; CHECK-NEXT: stur x9, [x11, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stur x9, [x29, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stur x9, [x29, #-248] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x10
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stur x9, [x29, #-240] // 8-byte Folded Spill
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x14
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, #16
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: mov x9, x8
-; CHECK-NEXT: incb x9, all, mul #2
-; CHECK-NEXT: mov w0, w9
-; CHECK-NEXT: // implicit-def: $x9
-; CHECK-NEXT: mov w9, w0
-; CHECK-NEXT: and x9, x9, #0x7f0
-; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: subs x10, x10, x9
-; CHECK-NEXT: and x10, x10, #0xffffffffffffffe0
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: mov x2, sp
-; CHECK-NEXT: subs x10, x2, #16
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-232] // 8-byte Folded Spill
-; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: subs x11, x10, x14
-; CHECK-NEXT: mov sp, x11
-; CHECK-NEXT: mov x10, x11
-; CHECK-NEXT: stur x10, [x29, #-224] // 8-byte Folded Spill
-; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: subs x10, x0, #16
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-216] // 8-byte Folded Spill
-; CHECK-NEXT: mov x17, sp
-; CHECK-NEXT: subs x10, x17, #16
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-208] // 8-byte Folded Spill
-; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: subs x10, x10, x14
-; CHECK-NEXT: stur x10, [x29, #-32] // 8-byte Folded Spill
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-200] // 8-byte Folded Spill
-; CHECK-NEXT: mov x15, sp
-; CHECK-NEXT: subs x10, x15, #16
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-192] // 8-byte Folded Spill
-; CHECK-NEXT: mov x13, sp
-; CHECK-NEXT: subs x10, x13, #16
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: stur x10, [x29, #-184] // 8-byte Folded Spill
-; CHECK-NEXT: incw x8
-; CHECK-NEXT: mov w1, w8
-; CHECK-NEXT: // implicit-def: $x8
-; CHECK-NEXT: mov w8, w1
-; CHECK-NEXT: and x12, x8, #0xf0
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x10, x8, x12
-; CHECK-NEXT: mov sp, x10
-; CHECK-NEXT: mov x8, x10
-; CHECK-NEXT: stur x8, [x29, #-176] // 8-byte Folded Spill
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x8, x8, x12
-; CHECK-NEXT: stur x8, [x29, #-24] // 8-byte Folded Spill
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-168] // 8-byte Folded Spill
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x8, x8, x9
-; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-160] // 8-byte Folded Spill
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x8, x8, x9
-; CHECK-NEXT: and x8, x8, #0xffffffffffffffe0
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-152] // 8-byte Folded Spill
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-56] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-48] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x24, sp
-; CHECK-NEXT: subs x8, x24, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x7, sp
-; CHECK-NEXT: subs x8, x7, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: subs x8, x27, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x26, sp
-; CHECK-NEXT: subs x8, x26, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x1, sp
-; CHECK-NEXT: subs x8, x1, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x8, x9, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x20, sp
-; CHECK-NEXT: subs x8, x20, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x16, sp
-; CHECK-NEXT: subs x8, x16, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-144] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x5, sp
-; CHECK-NEXT: subs x8, x5, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: subs x8, x12, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x22, sp
-; CHECK-NEXT: subs x8, x22, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x25, sp
-; CHECK-NEXT: subs x8, x25, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x30, sp
-; CHECK-NEXT: subs x8, x30, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-96] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-64] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-128] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-136] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-120] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-80] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-112] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-88] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x6, sp
-; CHECK-NEXT: subs x8, x6, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x21, sp
-; CHECK-NEXT: subs x8, x21, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: stur x8, [x29, #-40] // 8-byte Folded Spill
-; CHECK-NEXT: subs x8, x8, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x28, sp
-; CHECK-NEXT: subs x8, x28, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x4, x8, x14
-; CHECK-NEXT: mov sp, x4
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x3, x8, x14
-; CHECK-NEXT: mov sp, x3
-; CHECK-NEXT: mov x23, sp
-; CHECK-NEXT: subs x8, x23, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x18, sp
-; CHECK-NEXT: subs x8, x18, #16
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: mov x14, sp
-; CHECK-NEXT: subs x8, x14, #16
-; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: sturb w8, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-144] // 8-byte Folded Reload
-; CHECK-NEXT: sturb w8, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload
-; CHECK-NEXT: sturb w8, [x30, #-16]
+; CHECK-NEXT: strb w8, [sp, #207]
+; CHECK-NEXT: strb w8, [sp, #183]
+; CHECK-NEXT: strb w8, [sp, #143]
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: stur x8, [x29, #-16] // 8-byte Folded Spill
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x20, #-16]
-; CHECK-NEXT: ldur x9, [x27, #-16]
-; CHECK-NEXT: add x30, x8, x9, lsl #2
-; CHECK-NEXT: ldur x8, [x1, #-16]
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: ldur x9, [x16, #-16]
-; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload
-; CHECK-NEXT: add x30, x30, x8, lsl #2
-; CHECK-NEXT: ldur x8, [x29, #-96] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x5, #-16]
-; CHECK-NEXT: ldur x9, [x26, #-16]
-; CHECK-NEXT: add x30, x8, x9, lsl #2
-; CHECK-NEXT: ldur x8, [x1, #-16]
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: ldur x9, [x12, #-16]
-; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: ldur x9, [x29, #-128] // 8-byte Folded Reload
-; CHECK-NEXT: add x30, x30, x8, lsl #2
-; CHECK-NEXT: ldur x8, [x29, #-64] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x22, #-16]
-; CHECK-NEXT: ldur x9, [x27, #-16]
-; CHECK-NEXT: add x30, x8, x9, lsl #2
-; CHECK-NEXT: ldur x8, [x26, #-16]
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: ldur x9, [x25, #-16]
-; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: ldur x9, [x29, #-136] // 8-byte Folded Reload
-; CHECK-NEXT: add x30, x30, x8, lsl #2
-; CHECK-NEXT: ldur x8, [x29, #-128] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-120] // 8-byte Folded Reload
-; CHECK-NEXT: mov w30, #32 // =0x20
-; CHECK-NEXT: // kill: def $lr killed $w30
-; CHECK-NEXT: stur x30, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x1, #-16]
-; CHECK-NEXT: lsl x8, x8, #5
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-112] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x16, #-16]
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x27, #-16]
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: lsr x8, x8, #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x20, #-16]
-; CHECK-NEXT: stur x8, [x29, #-104] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x9, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x8, #-16]
-; CHECK-NEXT: mul x9, x9, x8
-; CHECK-NEXT: ldur x8, [x29, #-104] // 8-byte Folded Reload
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ldur x9, [x29, #-96] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-88] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x12, #-16]
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x26, #-16]
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: lsr x8, x8, #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x5, #-16]
-; CHECK-NEXT: stur x8, [x29, #-72] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x8, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x9, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x8, #-16]
-; CHECK-NEXT: mul x9, x9, x8
-; CHECK-NEXT: ldur x8, [x29, #-72] // 8-byte Folded Reload
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ldur x9, [x29, #-64] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x9, [x29, #-40] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x6, #-16]
-; CHECK-NEXT: stur x8, [x6, #-16]
-; CHECK-NEXT: stur x8, [x21, #-16]
-; CHECK-NEXT: stur x8, [x21, #-16]
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x27, #-16]
-; CHECK-NEXT: ldur x9, [x21, #-16]
-; CHECK-NEXT: subs x8, x8, x9
-; CHECK-NEXT: ldur x9, [x29, #-56] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x9, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-48] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-40] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x9, [x9, #-16]
-; CHECK-NEXT: stur x9, [x8, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x28, #-16]
-; CHECK-NEXT: ldur x8, [x26, #-16]
-; CHECK-NEXT: ldur x9, [x6, #-16]
-; CHECK-NEXT: subs x8, x8, x9
-; CHECK-NEXT: ldur x9, [x29, #-32] // 8-byte Folded Reload
-; CHECK-NEXT: stur x8, [x24, #-16]
-; CHECK-NEXT: ldur x8, [x29, #-24] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x7, #-16]
-; CHECK-NEXT: ldur x7, [x29, #-16] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x24, [x24, #-16]
-; CHECK-NEXT: stur x24, [x28, #-16]
-; CHECK-NEXT: ldur x24, [x21, #-16]
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: whilelt pn8.s, x24, x27, vlx2
-; CHECK-NEXT: str pn8, [x4]
-; CHECK-NEXT: ldur x24, [x6, #-16]
-; CHECK-NEXT: ldur x26, [x26, #-16]
-; CHECK-NEXT: whilelt pn8.s, x24, x26, vlx2
-; CHECK-NEXT: str pn8, [x3]
-; CHECK-NEXT: stur x7, [x23, #-16]
-; CHECK-NEXT: ldur x22, [x22, #-16]
-; CHECK-NEXT: ldur x24, [x21, #-16]
-; CHECK-NEXT: add x22, x22, x24, lsl #2
-; CHECK-NEXT: ldur x24, [x6, #-16]
-; CHECK-NEXT: ldur x25, [x25, #-16]
-; CHECK-NEXT: mul x24, x24, x25
-; CHECK-NEXT: add x22, x22, x24, lsl #2
-; CHECK-NEXT: stur x22, [x23, #-16]
+; CHECK-NEXT: str x8, [sp, #8] // 8-byte Spill
+; CHECK-NEXT: str x8, [sp, #128]
+; CHECK-NEXT: ldr x9, [sp, #192]
+; CHECK-NEXT: ldr x10, [sp, #224]
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: ldr x10, [sp, #208]
+; CHECK-NEXT: subs x10, x10, #1
+; CHECK-NEXT: ldr x11, [sp, #184]
+; CHECK-NEXT: mul x10, x10, x11
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: str x9, [sp, #128]
+; CHECK-NEXT: str x8, [sp, #120]
+; CHECK-NEXT: ldr x9, [sp, #168]
+; CHECK-NEXT: ldr x10, [sp, #216]
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: ldr x10, [sp, #208]
+; CHECK-NEXT: subs x10, x10, #1
+; CHECK-NEXT: ldr x11, [sp, #160]
+; CHECK-NEXT: mul x10, x10, x11
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: str x9, [sp, #120]
+; CHECK-NEXT: str x8, [sp, #112]
+; CHECK-NEXT: ldr x9, [sp, #152]
+; CHECK-NEXT: ldr x10, [sp, #224]
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: ldr x10, [sp, #216]
+; CHECK-NEXT: subs x10, x10, #1
+; CHECK-NEXT: ldr x11, [sp, #144]
+; CHECK-NEXT: mul x10, x10, x11
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: str x9, [sp, #112]
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: // kill: def $x9 killed $w9
+; CHECK-NEXT: str x9, [sp, #104]
+; CHECK-NEXT: str x9, [sp, #96]
+; CHECK-NEXT: str x8, [sp, #88]
+; CHECK-NEXT: ldr x10, [sp, #208]
+; CHECK-NEXT: lsl x10, x10, #5
+; CHECK-NEXT: str x10, [sp, #88]
+; CHECK-NEXT: str x9, [sp, #184]
+; CHECK-NEXT: str x8, [sp, #80]
+; CHECK-NEXT: ldr x10, [sp, #224]
+; CHECK-NEXT: subs x10, x10, #1
+; CHECK-NEXT: lsr x10, x10, #5
+; CHECK-NEXT: add x10, x10, #1
+; CHECK-NEXT: str x10, [sp, #80]
+; CHECK-NEXT: ldr x10, [sp, #192]
+; CHECK-NEXT: ldr x11, [sp, #80]
+; CHECK-NEXT: ldr x12, [sp, #88]
+; CHECK-NEXT: mul x11, x11, x12
+; CHECK-NEXT: add x10, x10, x11, lsl #2
+; CHECK-NEXT: str x10, [sp, #128]
+; CHECK-NEXT: str x9, [sp, #160]
+; CHECK-NEXT: str x8, [sp, #72]
+; CHECK-NEXT: ldr x10, [sp, #216]
+; CHECK-NEXT: subs x10, x10, #1
+; CHECK-NEXT: lsr x10, x10, #5
+; CHECK-NEXT: add x10, x10, #1
+; CHECK-NEXT: str x10, [sp, #72]
+; CHECK-NEXT: ldr x10, [sp, #168]
+; CHECK-NEXT: ldr x11, [sp, #72]
+; CHECK-NEXT: ldr x12, [sp, #88]
+; CHECK-NEXT: mul x11, x11, x12
+; CHECK-NEXT: add x10, x10, x11, lsl #2
+; CHECK-NEXT: str x10, [sp, #120]
+; CHECK-NEXT: str x8, [sp, #64]
+; CHECK-NEXT: str x8, [sp, #64]
+; CHECK-NEXT: str x8, [sp, #56]
+; CHECK-NEXT: str x8, [sp, #56]
+; CHECK-NEXT: str x8, [sp, #48]
+; CHECK-NEXT: ldr x10, [sp, #224]
+; CHECK-NEXT: ldr x11, [sp, #56]
+; CHECK-NEXT: subs x10, x10, x11
+; CHECK-NEXT: str x10, [sp, #256]
+; CHECK-NEXT: str x9, [sp, #248]
+; CHECK-NEXT: ldr x10, [sp, #256]
+; CHECK-NEXT: str x10, [sp, #48]
+; CHECK-NEXT: str x8, [sp, #40]
+; CHECK-NEXT: ldr x10, [sp, #216]
+; CHECK-NEXT: ldr x11, [sp, #64]
+; CHECK-NEXT: subs x10, x10, x11
+; CHECK-NEXT: str x10, [sp, #240]
+; CHECK-NEXT: str x9, [sp, #232]
+; CHECK-NEXT: ldr x9, [sp, #240]
+; CHECK-NEXT: str x9, [sp, #40]
+; CHECK-NEXT: ldr x9, [sp, #56]
+; CHECK-NEXT: ldr x10, [sp, #224]
+; CHECK-NEXT: whilelt pn8.s, x9, x10, vlx2
+; CHECK-NEXT: add x9, sp, #352
+; CHECK-NEXT: str pn8, [x9, #7, mul vl]
+; CHECK-NEXT: ldr x9, [sp, #64]
+; CHECK-NEXT: ldr x10, [sp, #216]
+; CHECK-NEXT: whilelt pn8.s, x9, x10, vlx2
+; CHECK-NEXT: add x9, sp, #352
+; CHECK-NEXT: str pn8, [x9, #6, mul vl]
+; CHECK-NEXT: str x8, [sp, #32]
+; CHECK-NEXT: ldr x9, [sp, #152]
+; CHECK-NEXT: ldr x10, [sp, #56]
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: ldr x10, [sp, #64]
+; CHECK-NEXT: ldr x11, [sp, #144]
+; CHECK-NEXT: mul x10, x10, x11
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: str x9, [sp, #32]
; CHECK-NEXT: zero {za}
-; CHECK-NEXT: stur x7, [x18, #-16]
-; CHECK-NEXT: ldur x20, [x20, #-16]
-; CHECK-NEXT: ldur x21, [x21, #-16]
-; CHECK-NEXT: ldur x22, [x1, #-16]
-; CHECK-NEXT: mul x21, x21, x22
-; CHECK-NEXT: add x20, x20, x21, lsl #2
-; CHECK-NEXT: stur x20, [x18, #-16]
-; CHECK-NEXT: stur x7, [x14, #-16]
-; CHECK-NEXT: ldur x5, [x5, #-16]
-; CHECK-NEXT: ldur x6, [x6, #-16]
-; CHECK-NEXT: ldur x7, [x1, #-16]
-; CHECK-NEXT: mul x6, x6, x7
-; CHECK-NEXT: add x5, x5, x6, lsl #2
-; CHECK-NEXT: stur x5, [x14, #-16]
-; CHECK-NEXT: ldur x1, [x1, #-16]
-; CHECK-NEXT: ldr p1, [x4]
-; CHECK-NEXT: ldur x18, [x18, #-16]
-; CHECK-NEXT: ldur x16, [x16, #-16]
-; CHECK-NEXT: lsr x16, x16, #2
-; CHECK-NEXT: ldr p0, [x3]
-; CHECK-NEXT: ldur x14, [x14, #-16]
-; CHECK-NEXT: ldur x12, [x12, #-16]
-; CHECK-NEXT: lsr x12, x12, #2
-; CHECK-NEXT: stur x1, [x2, #-16]
-; CHECK-NEXT: str p1, [x11]
-; CHECK-NEXT: stur x18, [x0, #-16]
-; CHECK-NEXT: stur x16, [x17, #-16]
-; CHECK-NEXT: str p0, [x9]
-; CHECK-NEXT: stur x14, [x15, #-16]
-; CHECK-NEXT: stur x12, [x13, #-16]
-; CHECK-NEXT: ldr p0, [x11]
+; CHECK-NEXT: str x8, [sp, #24]
+; CHECK-NEXT: ldr x9, [sp, #192]
+; CHECK-NEXT: ldr x10, [sp, #56]
+; CHECK-NEXT: ldr x11, [sp, #208]
+; CHECK-NEXT: mul x10, x10, x11
+; CHECK-NEXT: add x9, x9, x10, lsl #2
+; CHECK-NEXT: str x9, [sp, #24]
+; CHECK-NEXT: str x8, [sp, #16]
+; CHECK-NEXT: ldr x8, [sp, #168]
+; CHECK-NEXT: ldr x9, [sp, #64]
+; CHECK-NEXT: ldr x10, [sp, #208]
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: add x8, x8, x9, lsl #2
+; CHECK-NEXT: str x8, [sp, #16]
+; CHECK-NEXT: ldr x12, [sp, #208]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #7, mul vl]
+; CHECK-NEXT: ldr x11, [sp, #24]
+; CHECK-NEXT: ldr x8, [sp, #184]
+; CHECK-NEXT: lsr x10, x8, #2
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #6, mul vl]
+; CHECK-NEXT: ldr x9, [sp, #16]
+; CHECK-NEXT: ldr x8, [sp, #160]
+; CHECK-NEXT: lsr x8, x8, #2
+; CHECK-NEXT: str x12, [sp, #296]
+; CHECK-NEXT: add x12, sp, #352
+; CHECK-NEXT: str p1, [x12, #47, mul vl]
+; CHECK-NEXT: str x11, [sp, #288]
+; CHECK-NEXT: str x10, [sp, #280]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: str p0, [x10, #46, mul vl]
+; CHECK-NEXT: str x9, [sp, #272]
+; CHECK-NEXT: str x8, [sp, #264]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #47, mul vl]
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: pext { p3.s, p4.s }, pn8[0]
; CHECK-NEXT: mov p0.b, p3.b
@@ -517,146 +179,168 @@ define void @quux() #1 {
; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b
; CHECK-NEXT: mov p1.b, p4.b
; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b
-; CHECK-NEXT: mov x11, x10
-; CHECK-NEXT: incd x11
-; CHECK-NEXT: str p1, [x11]
-; CHECK-NEXT: str p0, [x10]
-; CHECK-NEXT: ldr p0, [x9]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: addpl x8, x8, #31
+; CHECK-NEXT: addpl x8, x8, #13
+; CHECK-NEXT: incd x8
+; CHECK-NEXT: str p1, [x8]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p0, [x8, #44, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #46, mul vl]
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: pext { p3.s, p4.s }, pn8[0]
; CHECK-NEXT: mov p0.b, p3.b
; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b
; CHECK-NEXT: mov p1.b, p4.b
; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b
-; CHECK-NEXT: mov x9, x8
-; CHECK-NEXT: incd x9
-; CHECK-NEXT: str p1, [x9]
-; CHECK-NEXT: str p0, [x8]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: addpl x8, x8, #31
+; CHECK-NEXT: addpl x8, x8, #11
+; CHECK-NEXT: incd x8
+; CHECK-NEXT: str p1, [x8]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p0, [x8, #42, mul vl]
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_3: // %bb178
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldur x9, [x29, #-232] // 8-byte Folded Reload
-; CHECK-NEXT: sub x8, x29, #80
-; CHECK-NEXT: ldur x8, [x8, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x10, x29, #88
-; CHECK-NEXT: ldur x10, [x10, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x11, x29, #104
-; CHECK-NEXT: ldur x11, [x11, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x12, x29, #112
-; CHECK-NEXT: ldur x12, [x12, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x13, [x29, #-152] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x14, [x29, #-160] // 8-byte Folded Reload
-; CHECK-NEXT: sub x15, x29, #48
-; CHECK-NEXT: ldur x17, [x15, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x15, x29, #56
-; CHECK-NEXT: ldur x18, [x15, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x15, x29, #64
-; CHECK-NEXT: ldur x0, [x15, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x15, x29, #72
-; CHECK-NEXT: ldur x1, [x15, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x15, [x29, #-168] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x2, [x29, #-176] // 8-byte Folded Reload
-; CHECK-NEXT: sub x16, x29, #16
-; CHECK-NEXT: ldur x3, [x16, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x16, x29, #24
-; CHECK-NEXT: ldur x4, [x16, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x16, x29, #32
-; CHECK-NEXT: ldur x5, [x16, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x16, x29, #40
-; CHECK-NEXT: ldur x6, [x16, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x16, [x29, #-240] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x7, [x29, #-248] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x20, [x29, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sub x21, x29, #8
-; CHECK-NEXT: ldur x21, [x21, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x23, [x29, #-192] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x22, [x29, #-184] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x24, [x29, #-200] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x26, [x29, #-216] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x25, [x29, #-208] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x29, #-224] // 8-byte Folded Reload
-; CHECK-NEXT: ldr p0, [x27]
-; CHECK-NEXT: ldr x27, [x26]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #47, mul vl]
+; CHECK-NEXT: ldr x8, [sp, #288]
; CHECK-NEXT: mov p8.b, p0.b
-; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27]
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x8]
; CHECK-NEXT: mov z0.d, z16.d
; CHECK-NEXT: mov z1.d, z24.d
-; CHECK-NEXT: str z1, [x14, #1, mul vl]
-; CHECK-NEXT: str z0, [x14]
-; CHECK-NEXT: ldr x27, [x25]
-; CHECK-NEXT: ldr x25, [x26]
-; CHECK-NEXT: add x25, x25, x27, lsl #2
-; CHECK-NEXT: str x25, [x26]
-; CHECK-NEXT: ldr p0, [x24]
-; CHECK-NEXT: ldr x24, [x23]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z1, [x8, #4, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z0, [x8, #3, mul vl]
+; CHECK-NEXT: ldr x9, [sp, #280]
+; CHECK-NEXT: ldr x8, [sp, #288]
+; CHECK-NEXT: add x8, x8, x9, lsl #2
+; CHECK-NEXT: str x8, [sp, #288]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #46, mul vl]
+; CHECK-NEXT: ldr x8, [sp, #272]
; CHECK-NEXT: mov p8.b, p0.b
-; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x24]
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x8]
; CHECK-NEXT: mov z0.d, z16.d
; CHECK-NEXT: mov z1.d, z24.d
-; CHECK-NEXT: str z1, [x13, #1, mul vl]
-; CHECK-NEXT: str z0, [x13]
-; CHECK-NEXT: ldr x24, [x22]
-; CHECK-NEXT: ldr x22, [x23]
-; CHECK-NEXT: add x22, x22, x24, lsl #2
-; CHECK-NEXT: str x22, [x23]
-; CHECK-NEXT: ldr p1, [x2]
-; CHECK-NEXT: ldr p0, [x15]
-; CHECK-NEXT: ldr z1, [x14]
-; CHECK-NEXT: ldr z0, [x13]
-; CHECK-NEXT: str p1, [x21]
-; CHECK-NEXT: str p0, [x20]
-; CHECK-NEXT: str z1, [x7]
-; CHECK-NEXT: str z0, [x16]
-; CHECK-NEXT: ldr p0, [x21]
-; CHECK-NEXT: ldr p1, [x20]
-; CHECK-NEXT: ldr z0, [x7]
-; CHECK-NEXT: ldr z1, [x16]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z1, [x8, #2, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z0, [x8, #1, mul vl]
+; CHECK-NEXT: ldr x9, [sp, #264]
+; CHECK-NEXT: ldr x8, [sp, #272]
+; CHECK-NEXT: add x8, x8, x9, lsl #2
+; CHECK-NEXT: str x8, [sp, #272]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #44, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #42, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #3, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p1, [x8, #95, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p0, [x8, #94, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z1, [x8, #10, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z0, [x8, #9, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #95, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #94, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #10, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #9, mul vl]
; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT: mov x16, x2
-; CHECK-NEXT: incd x16
-; CHECK-NEXT: ldr p1, [x16]
-; CHECK-NEXT: ldr p0, [x15]
-; CHECK-NEXT: ldr z1, [x14, #1, mul vl]
-; CHECK-NEXT: ldr z0, [x13]
-; CHECK-NEXT: str p1, [x6]
-; CHECK-NEXT: str p0, [x5]
-; CHECK-NEXT: str z1, [x4]
-; CHECK-NEXT: str z0, [x3]
-; CHECK-NEXT: ldr p0, [x6]
-; CHECK-NEXT: ldr p1, [x5]
-; CHECK-NEXT: ldr z0, [x4]
-; CHECK-NEXT: ldr z1, [x3]
+; CHECK-NEXT: add x9, sp, #352
+; CHECK-NEXT: addpl x9, x9, #31
+; CHECK-NEXT: addpl x9, x9, #13
+; CHECK-NEXT: incd x9
+; CHECK-NEXT: ldr p1, [x9]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #42, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #4, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p1, [x8, #119, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p0, [x8, #118, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z1, [x8, #13, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z0, [x8, #12, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #119, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #118, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #13, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #12, mul vl]
; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT: ldr p1, [x2]
-; CHECK-NEXT: incd x15
-; CHECK-NEXT: ldr p0, [x15]
-; CHECK-NEXT: ldr z1, [x14]
-; CHECK-NEXT: ldr z0, [x13, #1, mul vl]
-; CHECK-NEXT: str p1, [x1]
-; CHECK-NEXT: str p0, [x0]
-; CHECK-NEXT: str z1, [x18]
-; CHECK-NEXT: str z0, [x17]
-; CHECK-NEXT: ldr p0, [x1]
-; CHECK-NEXT: ldr p1, [x0]
-; CHECK-NEXT: ldr z0, [x18]
-; CHECK-NEXT: ldr z1, [x17]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #44, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: addpl x8, x8, #31
+; CHECK-NEXT: addpl x8, x8, #11
+; CHECK-NEXT: incd x8
+; CHECK-NEXT: ldr p0, [x8]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr z1, [x10, #3, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr z0, [x10, #2, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: str p1, [x10, #143, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: str p0, [x10, #142, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: str z1, [x10, #16, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: str z0, [x10, #15, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr p0, [x10, #143, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr p1, [x10, #142, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr z0, [x10, #16, mul vl]
+; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr z1, [x10, #15, mul vl]
; CHECK-NEXT: fmopa za2.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT: ldr p1, [x16]
-; CHECK-NEXT: ldr p0, [x15]
-; CHECK-NEXT: ldr z1, [x14, #1, mul vl]
-; CHECK-NEXT: ldr z0, [x13, #1, mul vl]
-; CHECK-NEXT: str p1, [x12]
-; CHECK-NEXT: str p0, [x11]
-; CHECK-NEXT: str z1, [x10]
-; CHECK-NEXT: str z0, [x8]
-; CHECK-NEXT: ldr p0, [x12]
-; CHECK-NEXT: ldr p1, [x11]
-; CHECK-NEXT: ldr z0, [x10]
-; CHECK-NEXT: ldr z1, [x8]
+; CHECK-NEXT: ldr p1, [x9]
+; CHECK-NEXT: ldr p0, [x8]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #4, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #2, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p1, [x8, #167, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str p0, [x8, #166, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z1, [x8, #19, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: str z0, [x8, #18, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p0, [x8, #167, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr p1, [x8, #166, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z0, [x8, #19, mul vl]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr z1, [x8, #18, mul vl]
; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT: ldr x8, [x9]
+; CHECK-NEXT: ldr x8, [sp, #296]
; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: str x8, [sp, #296]
; CHECK-NEXT: b .LBB0_3
bb:
%alloca = alloca <vscale x 16 x i1>, align 2
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index e672f777703a6..188059baa6675 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"
@@ -42,19 +42,57 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; Test lazy-save mechanism for multiple callees.
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
+; CHECK-SDAG-LABEL: test_lazy_save_2_callees:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_2:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_4
+; CHECK-SDAG-NEXT: // %bb.3:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_4:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_lazy_save_2_callees:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -64,48 +102,10 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
-; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB1_4
-; CHECK-NEXT: // %bb.3:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB1_2:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @private_za_callee()
call void @private_za_callee()
ret void
@@ -145,6 +145,50 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; Test a combination of streaming-compatible -> normal call with lazy-save.
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
+; CHECK-SDAG-LABEL: test_lazy_save_and_conditional_smstart:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: add x29, sp, #64
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mrs x20, SVCR
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: sub x10, x29, #80
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB3_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: .LBB3_2:
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: tbz w20, #0, .LBB3_4
+; CHECK-SDAG-NEXT: // %bb.3:
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: .LBB3_4:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #80
+; CHECK-SDAG-NEXT: cbnz x8, .LBB3_6
+; CHECK-SDAG-NEXT: // %bb.5:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB3_6:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: sub sp, x29, #64
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
@@ -157,12 +201,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mrs x20, SVCR
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #80
; CHECK-NEXT: stp x9, x8, [x29, #-80]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: mrs x20, SVCR
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -188,50 +232,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_lazy_save_and_conditional_smstart:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80]
-; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #80
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: .LBB3_2:
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3:
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: .LBB3_4:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB3_6:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @private_za_callee()
ret void
}
@@ -240,15 +240,67 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; restore from it (since ZA is off on return). We could improve this case
; by turning ZA off before the final private ZA call.
define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za"
+; CHECK-SDAG-LABEL: test_lazy_save_mixed_shared_and_private_callees:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -8
+; CHECK-SDAG-NEXT: .cfi_offset w20, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB4_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB4_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB4_4
+; CHECK-SDAG-NEXT: // %bb.3:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB4_4:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: bl shared_za_callee
+; CHECK-SDAG-NEXT: bl preserves_za_callee
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB4_6
+; CHECK-SDAG-NEXT: // %bb.5:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB4_6:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_lazy_save_mixed_shared_and_private_callees:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa w29, 32
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: rdsvl x8, #1
@@ -258,14 +310,14 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za"
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB4_2
-; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -277,67 +329,15 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za"
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: bl shared_za_callee
; CHECK-NEXT: bl preserves_za_callee
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB4_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB4_6:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: smstop za
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_lazy_save_mixed_shared_and_private_callees:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB4_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero {za}
-; CHECK-NEWLOWERING-NEXT: .LBB4_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB4_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB4_4:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: bl shared_za_callee
-; CHECK-NEWLOWERING-NEXT: bl preserves_za_callee
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
{
call void @private_za_callee()
call void @shared_za_callee()
@@ -347,15 +347,98 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za"
}
define void @test_many_back2back_private_za_calls() "aarch64_inout_za" {
+; CHECK-SDAG-LABEL: test_many_back2back_private_za_calls:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -8
+; CHECK-SDAG-NEXT: .cfi_offset w20, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: bl shared_za_callee
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_2:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_4
+; CHECK-SDAG-NEXT: // %bb.3:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_4:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_6
+; CHECK-SDAG-NEXT: // %bb.5:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_6:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_8
+; CHECK-SDAG-NEXT: // %bb.7:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_8:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_10
+; CHECK-SDAG-NEXT: // %bb.9:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_10:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_callee
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_12
+; CHECK-SDAG-NEXT: // %bb.11:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_12:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: bl shared_za_callee
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_many_back2back_private_za_calls:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa w29, 32
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: rdsvl x8, #1
@@ -364,110 +447,27 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" {
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: bl shared_za_callee
-; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_2
-; CHECK-NEXT: // %bb.1:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_2:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_4
-; CHECK-NEXT: // %bb.3:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_4:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_6:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_8
-; CHECK-NEXT: // %bb.7:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_8:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_10
-; CHECK-NEXT: // %bb.9:
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_10:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_12
-; CHECK-NEXT: // %bb.11:
+; CHECK-NEXT: cbnz x8, .LBB5_2
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_12:
+; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: bl shared_za_callee
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_many_back2back_private_za_calls:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: bl shared_za_callee
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB5_2:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: bl shared_za_callee
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @shared_za_callee()
call void @private_za_callee()
call void @private_za_callee()
@@ -572,19 +572,51 @@ declare i64 @many_args_private_za_callee(
; stack pointer before the call -- in this test the lazy save should be setup
; before the stack decrement.
define i64 @test_many_callee_arguments(
+; CHECK-SDAG-LABEL: test_many_callee_arguments:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: mov x8, sp
+; CHECK-SDAG-NEXT: rdsvl x9, #1
+; CHECK-SDAG-NEXT: msub x8, x9, x9, x8
+; CHECK-SDAG-NEXT: mov sp, x8
+; CHECK-SDAG-NEXT: ldp x10, x11, [x29, #32]
+; CHECK-SDAG-NEXT: sub x12, x29, #16
+; CHECK-SDAG-NEXT: stp x8, x9, [x29, #-16]
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x12
+; CHECK-SDAG-NEXT: stp x10, x11, [sp, #-16]!
+; CHECK-SDAG-NEXT: bl many_args_private_za_callee
+; CHECK-SDAG-NEXT: add sp, sp, #16
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB9_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB9_2:
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: test_many_callee_arguments:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: msub x8, x9, x9, x8
-; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldp x10, x11, [x29, #32]
; CHECK-NEXT: sub x12, x29, #16
-; CHECK-NEXT: stp x8, x9, [x29, #-16]
+; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: msr TPIDR2_EL0, x12
; CHECK-NEXT: stp x10, x11, [sp, #-16]!
; CHECK-NEXT: bl many_args_private_za_callee
@@ -603,38 +635,6 @@ define i64 @test_many_callee_arguments(
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: ldp x10, x11, [x29, #32]
-; CHECK-NEWLOWERING-NEXT: sub x12, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x12
-; CHECK-NEWLOWERING-NEXT: stp x10, x11, [sp, #-16]!
-; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: mov x1, x0
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB9_2:
-; CHECK-NEWLOWERING-NEXT: mov x0, x1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
) nounwind "aarch64_inout_za" {
%ret = call i64 @many_args_private_za_callee(
diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
index 0717387ae2963..d2715b58439d8 100644
--- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
+++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
@@ -1,51 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefix=CHECK-SDAG
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
declare void @shared_za_callee() "aarch64_inout_za"
define void @private_za() "aarch64_new_za" {
+; CHECK-SDAG-LABEL: private_za:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB0_2
+; CHECK-SDAG-NEXT: b .LBB0_1
+; CHECK-SDAG-NEXT: .LBB0_1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: mov x8, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: b .LBB0_2
+; CHECK-SDAG-NEXT: .LBB0_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: bl shared_za_callee
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: private_za:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: cbz x8, .LBB0_2
-; CHECK-NEXT: b .LBB0_1
-; CHECK-NEXT: .LBB0_1: // %save.za
+; CHECK-NEXT: cbnz x8, .LBB0_1
+; CHECK-NEXT: b .LBB0_2
+; CHECK-NEXT: .LBB0_1:
; CHECK-NEXT: bl __arm_tpidr2_save
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero {za}
; CHECK-NEXT: bl shared_za_callee
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: private_za:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -16
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_1
-; CHECK-NEWLOWERING-NEXT: b .LBB0_2
-; CHECK-NEWLOWERING-NEXT: .LBB0_1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero {za}
-; CHECK-NEWLOWERING-NEXT: b .LBB0_2
-; CHECK-NEWLOWERING-NEXT: .LBB0_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: bl shared_za_callee
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @shared_za_callee()
ret void
}
@@ -53,29 +53,65 @@ define void @private_za() "aarch64_new_za" {
; Note: This test must run at -O0 as otherwise the multiple exits are optimized out.
; TODO: We should be able to omit the ZA save here (as this function does not use ZA).
define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" {
+; CHECK-SDAG-LABEL: private_za_multiple_exit:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: sub sp, sp, #32
+; CHECK-SDAG-NEXT: str x30, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SDAG-NEXT: .cfi_offset w30, -16
+; CHECK-SDAG-NEXT: str x2, [sp] // 8-byte Spill
+; CHECK-SDAG-NEXT: str w1, [sp, #8] // 4-byte Spill
+; CHECK-SDAG-NEXT: str w0, [sp, #12] // 4-byte Spill
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB1_2
+; CHECK-SDAG-NEXT: b .LBB1_1
+; CHECK-SDAG-NEXT: .LBB1_1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: mov x8, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: b .LBB1_2
+; CHECK-SDAG-NEXT: .LBB1_2: // %entry
+; CHECK-SDAG-NEXT: ldr x8, [sp] // 8-byte Reload
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: subs x8, x8, #1
+; CHECK-SDAG-NEXT: b.ne .LBB1_4
+; CHECK-SDAG-NEXT: b .LBB1_3
+; CHECK-SDAG-NEXT: .LBB1_3: // %if.else
+; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload
+; CHECK-SDAG-NEXT: ldr w9, [sp, #8] // 4-byte Reload
+; CHECK-SDAG-NEXT: add w0, w8, w9
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: add sp, sp, #32
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB1_4: // %if.end
+; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload
+; CHECK-SDAG-NEXT: ldr w9, [sp, #8] // 4-byte Reload
+; CHECK-SDAG-NEXT: subs w0, w8, w9
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: add sp, sp, #32
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: private_za_multiple_exit:
-; CHECK: // %bb.0: // %prelude
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str x2, [sp] // 8-byte Spill
-; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill
-; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill
-; CHECK-NEXT: rdsvl x8, #1
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: cbz x8, .LBB1_2
-; CHECK-NEXT: b .LBB1_1
-; CHECK-NEXT: .LBB1_1: // %save.za
+; CHECK-NEXT: cbnz x8, .LBB1_1
+; CHECK-NEXT: b .LBB1_2
+; CHECK-NEXT: .LBB1_1: // %entry
; CHECK-NEXT: bl __arm_tpidr2_save
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_2: // %entry
-; CHECK-NEXT: ldr x8, [sp] // 8-byte Reload
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill
+; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill
+; CHECK-NEXT: subs x8, x2, #1
; CHECK-NEXT: b.ne .LBB1_4
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .LBB1_3: // %if.else
@@ -83,51 +119,15 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za"
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: smstop za
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_4: // %if.end
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload
; CHECK-NEXT: subs w0, w8, w9
; CHECK-NEXT: smstop za
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: private_za_multiple_exit:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_1
-; CHECK-NEWLOWERING-NEXT: b .LBB1_2
-; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %entry
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero {za}
-; CHECK-NEWLOWERING-NEXT: b .LBB1_2
-; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %entry
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: str w1, [sp, #8] // 4-byte Spill
-; CHECK-NEWLOWERING-NEXT: str w0, [sp, #12] // 4-byte Spill
-; CHECK-NEWLOWERING-NEXT: subs x8, x2, #1
-; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_4
-; CHECK-NEWLOWERING-NEXT: b .LBB1_3
-; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %if.else
-; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Reload
-; CHECK-NEWLOWERING-NEXT: add w0, w8, w9
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: ret
-; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %if.end
-; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Reload
-; CHECK-NEWLOWERING-NEXT: subs w0, w8, w9
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: ret
entry:
%tobool = icmp eq i64 %cond, 1
br i1 %tobool, label %if.else, label %if.end
@@ -143,36 +143,36 @@ if.end:
; In simple cases like this we should omit all ZA setup.
define i32 @private_za_trivially_does_not_use_za(i32 %x) "aarch64_new_za" {
+; CHECK-SDAG-LABEL: private_za_trivially_does_not_use_za:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: sub sp, sp, #32
+; CHECK-SDAG-NEXT: str x30, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SDAG-NEXT: .cfi_offset w30, -16
+; CHECK-SDAG-NEXT: str w0, [sp, #12] // 4-byte Spill
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB2_2
+; CHECK-SDAG-NEXT: b .LBB2_1
+; CHECK-SDAG-NEXT: .LBB2_1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: mov x8, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: b .LBB2_2
+; CHECK-SDAG-NEXT: .LBB2_2:
+; CHECK-SDAG-NEXT: ldr w8, [sp, #12] // 4-byte Reload
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: add w0, w8, w8
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: add sp, sp, #32
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: private_za_trivially_does_not_use_za:
-; CHECK: // %bb.0: // %prelude
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: cbz x8, .LBB2_2
-; CHECK-NEXT: b .LBB2_1
-; CHECK-NEXT: .LBB2_1: // %save.za
-; CHECK-NEXT: bl __arm_tpidr2_save
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEXT: b .LBB2_2
-; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: add w0, w8, w8
-; CHECK-NEXT: smstop za
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK: // %bb.0:
+; CHECK-NEXT: add w0, w0, w0
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: private_za_trivially_does_not_use_za:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: add w0, w0, w0
-; CHECK-NEWLOWERING-NEXT: ret
%ret = add i32 %x, %x
ret i32 %ret
}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 240b204d15210..50449172ce85b 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -1,86 +1,86 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
declare void @private_za_call()
declare void @shared_za_call() "aarch64_inout_za"
define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: private_za_loop:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: cmp w0, #1
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: b.lt .LBB0_5
+; CHECK-SDAG-NEXT: // %bb.1: // %loop.preheader
+; CHECK-SDAG-NEXT: mov w19, w0
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: b .LBB0_3
+; CHECK-SDAG-NEXT: .LBB0_2: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-SDAG-NEXT: subs w19, w19, #1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b.eq .LBB0_5
+; CHECK-SDAG-NEXT: .LBB0_3: // %loop
+; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB0_2
+; CHECK-SDAG-NEXT: // %bb.4: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: b .LBB0_2
+; CHECK-SDAG-NEXT: .LBB0_5: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: private_za_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: cmp w0, #1
; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: b.lt .LBB0_5
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %loop.preheader
; CHECK-NEXT: mov w19, w0
-; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_2: // %loop
-; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: subs w19, w19, #1
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: b.eq .LBB0_5
-; CHECK-NEXT: .LBB0_3: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_call
+; CHECK-NEXT: subs w19, w19, #1
+; CHECK-NEXT: b.ne .LBB0_2
+; CHECK-NEXT: .LBB0_3: // %exit
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_2
-; CHECK-NEXT: // %bb.4: // %loop
-; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: cbnz x8, .LBB0_5
+; CHECK-NEXT: // %bb.4: // %exit
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_5: // %exit
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: private_za_loop:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: cmp w0, #1
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: b.lt .LBB0_3
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: mov w19, w0
-; CHECK-NEWLOWERING-NEXT: .LBB0_2: // %loop
-; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
-; CHECK-NEWLOWERING-NEXT: b.ne .LBB0_2
-; CHECK-NEWLOWERING-NEXT: .LBB0_3: // %exit
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_5
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB0_5: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
entry:
%cmpgt = icmp sgt i32 %n, 0
br i1 %cmpgt, label %loop, label %exit
@@ -98,6 +98,47 @@ exit:
; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov w19, w0
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: cmp w19, #1
+; CHECK-SDAG-NEXT: b.lt .LBB1_5
+; CHECK-SDAG-NEXT: // %bb.1: // %loop.preheader
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: b .LBB1_3
+; CHECK-SDAG-NEXT: .LBB1_2: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-SDAG-NEXT: subs w19, w19, #1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b.eq .LBB1_5
+; CHECK-SDAG-NEXT: .LBB1_3: // %loop
+; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2
+; CHECK-SDAG-NEXT: // %bb.4: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: b .LBB1_2
+; CHECK-SDAG-NEXT: .LBB1_5: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: b shared_za_call
+;
; CHECK-LABEL: private_za_loop_active_entry_and_exit:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -106,9 +147,9 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: cmp w19, #1
@@ -118,13 +159,13 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .LBB1_2: // %loop
; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEXT: subs w19, w19, #1
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: b.eq .LBB1_5
+; CHECK-NEXT: cbz w19, .LBB1_5
; CHECK-NEXT: .LBB1_3: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_call
+; CHECK-NEXT: sub w19, w19, #1
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
@@ -138,47 +179,6 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov w19, w0
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: cmp w19, #1
-; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT: b .LBB1_3
-; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
-; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
-; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: b .LBB1_2
-; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: b shared_za_call
entry:
%cmpgt = icmp sgt i32 %n, 0
tail call void @shared_za_call()
@@ -268,6 +268,45 @@ exit:
}
define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: mixed_shared_private_za_loop:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov x19, x0
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: b .LBB4_2
+; CHECK-SDAG-NEXT: .LBB4_1: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: ldrb w8, [x19]
+; CHECK-SDAG-NEXT: tbz w8, #0, .LBB4_4
+; CHECK-SDAG-NEXT: .LBB4_2: // %loop
+; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB4_1
+; CHECK-SDAG-NEXT: // %bb.3: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: b .LBB4_1
+; CHECK-SDAG-NEXT: .LBB4_4: // %exit
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: mixed_shared_private_za_loop:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -276,26 +315,26 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: sub x20, x29, #16
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: b .LBB4_2
; CHECK-NEXT: .LBB4_1: // %loop
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: ldrb w8, [x19]
; CHECK-NEXT: tbz w8, #0, .LBB4_4
; CHECK-NEXT: .LBB4_2: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_call
+; CHECK-NEXT: ldrb w8, [x19]
; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: mrs x9, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB4_1
+; CHECK-NEXT: cbnz x9, .LBB4_1
; CHECK-NEXT: // %bb.3: // %loop
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: bl __arm_tpidr2_restore
@@ -306,45 +345,6 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: mixed_shared_private_za_loop:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov x19, x0
-; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: b .LBB4_2
-; CHECK-NEWLOWERING-NEXT: .LBB4_1: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: tbz w8, #0, .LBB4_4
-; CHECK-NEWLOWERING-NEXT: .LBB4_2: // %loop
-; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: ldrb w8, [x19]
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB4_1
-; CHECK-NEWLOWERING-NEXT: // %bb.3: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: b .LBB4_1
-; CHECK-NEWLOWERING-NEXT: .LBB4_4: // %exit
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
br label %loop
loop:
@@ -364,6 +364,49 @@ exit:
define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: cond_clobber_followed_by_clobber:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov w19, w0
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: tbz w19, #0, .LBB5_4
+; CHECK-SDAG-NEXT: // %bb.1: // %cond_clobber
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_3
+; CHECK-SDAG-NEXT: // %bb.2: // %cond_clobber
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_3: // %cond_clobber
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB5_4: // %exit
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB5_6
+; CHECK-SDAG-NEXT: // %bb.5: // %exit
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB5_6: // %exit
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: b shared_za_call
+;
; CHECK-LABEL: cond_clobber_followed_by_clobber:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -372,73 +415,30 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: bl shared_za_call
-; CHECK-NEXT: tbz w19, #0, .LBB5_4
-; CHECK-NEXT: // %bb.1: // %cond_clobber
; CHECK-NEXT: sub x8, x29, #16
; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: tbz w19, #0, .LBB5_2
+; CHECK-NEXT: // %bb.1: // %cond_clobber
; CHECK-NEXT: bl private_za_call
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_3
-; CHECK-NEXT: // %bb.2: // %cond_clobber
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_3: // %cond_clobber
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB5_4: // %exit
-; CHECK-NEXT: sub x8, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: .LBB5_2: // %exit
; CHECK-NEXT: bl private_za_call
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB5_6
-; CHECK-NEXT: // %bb.5: // %exit
+; CHECK-NEXT: cbnz x8, .LBB5_4
+; CHECK-NEXT: // %bb.3: // %exit
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB5_6: // %exit
+; CHECK-NEXT: .LBB5_4: // %exit
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: cond_clobber_followed_by_clobber:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov w19, w0
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB5_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %cond_clobber
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB5_2: // %exit
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB5_4: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: b shared_za_call
tail call void @shared_za_call()
br i1 %cond, label %cond_clobber, label %exit
@@ -543,6 +543,48 @@ merge_shared:
define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: diamond_mixed_za_merge_private:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: tbz w0, #0, .LBB8_2
+; CHECK-SDAG-NEXT: // %bb.1: // %then
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: b .LBB8_5
+; CHECK-SDAG-NEXT: .LBB8_2: // %else
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB8_4
+; CHECK-SDAG-NEXT: // %bb.3: // %else
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB8_4: // %else
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB8_5: // %merge_private_za
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB8_7
+; CHECK-SDAG-NEXT: // %bb.6: // %merge_private_za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB8_7: // %merge_private_za
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: diamond_mixed_za_merge_private:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -556,68 +598,26 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin
; CHECK-NEXT: tbz w0, #0, .LBB8_2
; CHECK-NEXT: // %bb.1: // %then
; CHECK-NEXT: bl shared_za_call
-; CHECK-NEXT: b .LBB8_5
-; CHECK-NEXT: .LBB8_2: // %else
; CHECK-NEXT: sub x8, x29, #16
; CHECK-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEXT: bl private_za_call
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB8_4
-; CHECK-NEXT: // %bb.3: // %else
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB8_4: // %else
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB8_5: // %merge_private_za
+; CHECK-NEXT: b .LBB8_3
+; CHECK-NEXT: .LBB8_2: // %else
; CHECK-NEXT: sub x8, x29, #16
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl private_za_call
+; CHECK-NEXT: .LBB8_3: // %merge_private_za
+; CHECK-NEXT: bl private_za_call
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB8_7
-; CHECK-NEXT: // %bb.6: // %merge_private_za
+; CHECK-NEXT: cbnz x8, .LBB8_5
+; CHECK-NEXT: // %bb.4: // %merge_private_za
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB8_7: // %merge_private_za
+; CHECK-NEXT: .LBB8_5: // %merge_private_za
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_private:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB8_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: b .LBB8_3
-; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %else
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB8_3: // %merge_private_za
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_5
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %merge_private_za
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB8_5: // %merge_private_za
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
entry:
br i1 %cond, label %then, label %else
@@ -635,6 +635,56 @@ merge_private_za:
}
define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: critical_edge_mixed_za:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov w19, w1
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: tbz w0, #0, .LBB9_5
+; CHECK-SDAG-NEXT: // %bb.1: // %shared_path
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: tbz w19, #0, .LBB9_8
+; CHECK-SDAG-NEXT: .LBB9_2: // %exit_private
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB9_4
+; CHECK-SDAG-NEXT: // %bb.3: // %exit_private
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB9_4: // %exit_private
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b .LBB9_9
+; CHECK-SDAG-NEXT: .LBB9_5: // %private_path
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB9_7
+; CHECK-SDAG-NEXT: // %bb.6: // %private_path
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB9_7: // %private_path
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: tbnz w19, #0, .LBB9_2
+; CHECK-SDAG-NEXT: .LBB9_8: // %exit_shared
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: .LBB9_9: // %common.ret
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: critical_edge_mixed_za:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -643,9 +693,9 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: tbz w0, #0, .LBB9_5
; CHECK-NEXT: // %bb.1: // %shared_path
@@ -684,56 +734,6 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: critical_edge_mixed_za:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov w19, w1
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB9_5
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %shared_path
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB9_8
-; CHECK-NEWLOWERING-NEXT: .LBB9_2: // %exit_private
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit_private
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB9_4: // %exit_private
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: b .LBB9_9
-; CHECK-NEWLOWERING-NEXT: .LBB9_5: // %private_path
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_7
-; CHECK-NEWLOWERING-NEXT: // %bb.6: // %private_path
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB9_7: // %private_path
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB9_2
-; CHECK-NEWLOWERING-NEXT: .LBB9_8: // %exit_shared
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB9_9: // %common.ret
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
entry:
br i1 %c1, label %shared_path, label %private_path
@@ -836,6 +836,46 @@ exit:
}
define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind {
+; CHECK-SDAG-LABEL: loop_with_external_entry:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov w19, w1
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: tbz w0, #0, .LBB11_2
+; CHECK-SDAG-NEXT: // %bb.1: // %init
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: .LBB11_2: // %loop.preheader
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: b .LBB11_4
+; CHECK-SDAG-NEXT: .LBB11_3: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB11_4 Depth=1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: tbz w19, #0, .LBB11_6
+; CHECK-SDAG-NEXT: .LBB11_4: // %loop
+; CHECK-SDAG-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl private_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB11_3
+; CHECK-SDAG-NEXT: // %bb.5: // %loop
+; CHECK-SDAG-NEXT: // in Loop: Header=BB11_4 Depth=1
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: b .LBB11_3
+; CHECK-SDAG-NEXT: .LBB11_6: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: loop_with_external_entry:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -844,9 +884,9 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: tbz w0, #0, .LBB11_2
; CHECK-NEXT: // %bb.1: // %init
@@ -875,46 +915,6 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: loop_with_external_entry:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov w19, w1
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB11_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init
-; CHECK-NEWLOWERING-NEXT: bl shared_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT: b .LBB11_4
-; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6
-; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop
-; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3
-; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: b .LBB11_3
-; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
entry:
br i1 %c1, label %init, label %loop
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index 3947127c47844..5243b8d7203d8 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG
; A simple EH test case that corresponds to the following C++ source:
;
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index afd56d198d0d3..d4840f77c5392 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
define i32 @no_tpidr2_save_required() "aarch64_inout_za" {
; CHECK-COMMON-LABEL: no_tpidr2_save_required:
@@ -64,6 +64,51 @@ exit:
}
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
+; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: str xzr, [sp, #-16]!
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -8
+; CHECK-SDAG-NEXT: .cfi_offset w29, -16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-SDAG-NEXT: cmp sp, x9
+; CHECK-SDAG-NEXT: b.le .LBB2_3
+; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-SDAG-NEXT: str xzr, [sp]
+; CHECK-SDAG-NEXT: b .LBB2_1
+; CHECK-SDAG-NEXT: .LBB2_3:
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: ldr xzr, [sp]
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: cbz w0, .LBB2_5
+; CHECK-SDAG-NEXT: // %bb.4: // %use_b
+; CHECK-SDAG-NEXT: fmov s1, #4.00000000
+; CHECK-SDAG-NEXT: fadd s0, s0, s1
+; CHECK-SDAG-NEXT: b .LBB2_8
+; CHECK-SDAG-NEXT: .LBB2_5: // %use_c
+; CHECK-SDAG-NEXT: fmov s0, s1
+; CHECK-SDAG-NEXT: sub x8, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
+; CHECK-SDAG-NEXT: bl cosf
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB2_7
+; CHECK-SDAG-NEXT: // %bb.6: // %use_c
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB2_7: // %use_c
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB2_8: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -74,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x9
@@ -90,69 +137,22 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEXT: // %bb.4: // %use_b
; CHECK-NEXT: fmov s1, #4.00000000
; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: b .LBB2_8
+; CHECK-NEXT: b .LBB2_6
; CHECK-NEXT: .LBB2_5: // %use_c
; CHECK-NEXT: fmov s0, s1
-; CHECK-NEXT: sub x8, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl cosf
+; CHECK-NEXT: .LBB2_6: // %exit
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB2_7
-; CHECK-NEXT: // %bb.6: // %use_c
+; CHECK-NEXT: cbnz x8, .LBB2_8
+; CHECK-NEXT: // %bb.7: // %exit
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB2_7: // %use_c
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: .LBB2_8: // %exit
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required_stackprobe:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: str xzr, [sp, #-16]!
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-NEWLOWERING-NEXT: cmp sp, x9
-; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
-; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: b .LBB2_1
-; CHECK-NEWLOWERING-NEXT: .LBB2_3:
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
-; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
-; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT: b .LBB2_6
-; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
-; CHECK-NEWLOWERING-NEXT: fmov s0, s1
-; CHECK-NEWLOWERING-NEXT: bl cosf
-; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
-; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
%cmp = icmp ne i32 %a, 0
br i1 %cmp, label %use_b, label %use_c
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 0d4a39b2eeb2f..24b4565cf24b5 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -aarch64-new-sme-abi=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
;
; Private-ZA Callee
@@ -30,6 +30,36 @@ define void @zt0_in_caller_no_state_callee(ptr %callee) "aarch64_in_zt0" nounwin
; Expect setup and restore lazy-save around call
; Expect smstart za after call
define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind {
+; CHECK-SDAG-LABEL: za_zt0_shared_caller_no_state_callee:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #80
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: sub x10, x29, #16
+; CHECK-SDAG-NEXT: sub x19, x29, #80
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10
+; CHECK-SDAG-NEXT: str zt0, [x19]
+; CHECK-SDAG-NEXT: blr x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x19]
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_2:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: za_zt0_shared_caller_no_state_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
@@ -40,55 +70,25 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: sub x19, x29, #80
-; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: sub x19, x29, #64
+; CHECK-NEXT: sub x10, x29, #80
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x19]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: sub x0, x29, #80
; CHECK-NEXT: cbnz x8, .LBB1_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: ldr zt0, [x19]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: za_zt0_shared_caller_no_state_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x19, x29, #64
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80]
-; CHECK-NEWLOWERING-NEXT: str zt0, [x19]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: blr x0
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB1_2:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19]
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void %callee();
ret void;
}
@@ -167,48 +167,48 @@ define void @zt0_in_caller_zt0_new_callee(ptr %callee) "aarch64_in_zt0" nounwind
; Expect spill & fill of ZT0 around call
; Before return, expect smstop ZA
define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwind {
+; CHECK-SDAG-LABEL: zt0_new_caller_zt0_new_callee:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: sub sp, sp, #80
+; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB6_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB6_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero { zt0 }
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: str zt0, [x19]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: blr x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x19]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: add sp, sp, #80
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: zt0_new_caller_zt0_new_callee:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB6_2
-; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero { zt0 }
-; CHECK-NEXT: mov x19, sp
-; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str zt0, [x8]
; CHECK-NEXT: smstop za
; CHECK-NEXT: blr x0
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x19]
-; CHECK-NEXT: smstop za
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero { zt0 }
-; CHECK-NEWLOWERING-NEXT: .LBB6_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mov x8, sp
-; CHECK-NEWLOWERING-NEXT: str zt0, [x8]
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: blr x0
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #80
-; CHECK-NEWLOWERING-NEXT: ret
call void %callee() "aarch64_new_zt0";
ret void;
}
@@ -219,46 +219,46 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi
; Expect spill & fill of ZT0 around __arm_sme_state call
; Before return, expect smstop ZA
define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind {
+; CHECK-SDAG-LABEL: zt0_new_caller_abi_routine_callee:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: sub sp, sp, #80
+; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB7_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB7_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero { zt0 }
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: str zt0, [x19]
+; CHECK-SDAG-NEXT: bl __arm_sme_state
+; CHECK-SDAG-NEXT: ldr zt0, [x19]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: add sp, sp, #80
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: zt0_new_caller_abi_routine_callee:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB7_2
-; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero { zt0 }
-; CHECK-NEXT: mov x19, sp
-; CHECK-NEXT: str zt0, [x19]
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str zt0, [x8]
; CHECK-NEXT: smstop za
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero { zt0 }
-; CHECK-NEWLOWERING-NEXT: .LBB7_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mov x8, sp
-; CHECK-NEWLOWERING-NEXT: str zt0, [x8]
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #80
-; CHECK-NEWLOWERING-NEXT: ret
%res = call {i64, i64} @__arm_sme_state()
%res.0 = extractvalue {i64, i64} %res, 0
ret i64 %res.0
@@ -274,37 +274,37 @@ declare {i64, i64} @__arm_sme_state()
; Expect smstart ZA & clear ZT0
; Before return, expect smstop ZA
define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind {
+; CHECK-SDAG-LABEL: zt0_new_caller:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB8_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB8_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero { zt0 }
+; CHECK-SDAG-NEXT: blr x0
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: zt0_new_caller:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB8_2
-; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: .LBB8_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: zt0_new_caller:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB8_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero { zt0 }
-; CHECK-NEWLOWERING-NEXT: .LBB8_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: blr x0
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void %callee() "aarch64_in_zt0";
ret void;
}
@@ -313,39 +313,39 @@ define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind {
; Expect smstart ZA, clear ZA & clear ZT0
; Before return, expect smstop ZA
define void @new_za_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_new_zt0" nounwind {
+; CHECK-SDAG-LABEL: new_za_zt0_caller:
+; CHECK-SDAG: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB9_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB9_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: zero { zt0 }
+; CHECK-SDAG-NEXT: blr x0
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: new_za_zt0_caller:
-; CHECK: // %bb.0: // %prelude
+; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB9_2
-; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB9_2:
-; CHECK-NEXT: smstart za
; CHECK-NEXT: zero {za}
; CHECK-NEXT: zero { zt0 }
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: smstart za
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: new_za_zt0_caller:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB9_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: zero {za}
-; CHECK-NEWLOWERING-NEXT: zero { zt0 }
-; CHECK-NEWLOWERING-NEXT: .LBB9_2:
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: blr x0
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void %callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
}
@@ -378,6 +378,38 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0"
define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind {
+; CHECK-SDAG-LABEL: zt0_multiple_private_za_calls:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: sub sp, sp, #96
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x20, sp
+; CHECK-SDAG-NEXT: mov x19, x0
+; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Spill
+; CHECK-SDAG-NEXT: str zt0, [x20]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: blr x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+; CHECK-SDAG-NEXT: str zt0, [x20]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: blr x19
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+; CHECK-SDAG-NEXT: str zt0, [x20]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: blr x19
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+; CHECK-SDAG-NEXT: str zt0, [x20]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: blr x19
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Reload
+; CHECK-SDAG-NEXT: add sp, sp, #96
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: zt0_multiple_private_za_calls:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #96
@@ -388,20 +420,8 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
; CHECK-NEXT: str zt0, [x20]
; CHECK-NEXT: smstop za
; CHECK-NEXT: blr x0
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x20]
-; CHECK-NEXT: str zt0, [x20]
-; CHECK-NEXT: smstop za
; CHECK-NEXT: blr x19
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x20]
-; CHECK-NEXT: str zt0, [x20]
-; CHECK-NEXT: smstop za
; CHECK-NEXT: blr x19
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x20]
-; CHECK-NEXT: str zt0, [x20]
-; CHECK-NEXT: smstop za
; CHECK-NEXT: blr x19
; CHECK-NEXT: smstart za
; CHECK-NEXT: ldr zt0, [x20]
@@ -409,26 +429,6 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x20, sp
-; CHECK-NEWLOWERING-NEXT: mov x19, x0
-; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: str zt0, [x20]
-; CHECK-NEWLOWERING-NEXT: smstop za
-; CHECK-NEWLOWERING-NEXT: blr x0
-; CHECK-NEWLOWERING-NEXT: blr x19
-; CHECK-NEWLOWERING-NEXT: blr x19
-; CHECK-NEWLOWERING-NEXT: blr x19
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20]
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload
-; CHECK-NEWLOWERING-NEXT: add sp, sp, #96
-; CHECK-NEWLOWERING-NEXT: ret
call void %callee()
call void %callee()
call void %callee()
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index c1a42b568673a..000523de203fc 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -3238,9 +3238,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK0-NEXT: sub sp, sp, #16
; CHECK0-NEXT: rdsvl x8, #1
; CHECK0-NEXT: mov x9, sp
-; CHECK0-NEXT: mov w20, w0
; CHECK0-NEXT: msub x9, x8, x8, x9
; CHECK0-NEXT: mov sp, x9
+; CHECK0-NEXT: mov w20, w0
; CHECK0-NEXT: sub x10, x29, #80
; CHECK0-NEXT: stp x9, x8, [x29, #-80]
; CHECK0-NEXT: msr TPIDR2_EL0, x10
@@ -3309,10 +3309,10 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: sub sp, sp, #80
; CHECK64-NEXT: rdsvl x8, #1
; CHECK64-NEXT: mov x9, sp
-; CHECK64-NEXT: mov w20, w0
-; CHECK64-NEXT: msub x9, x8, x8, x9
; CHECK64-NEXT: mov x19, sp
+; CHECK64-NEXT: msub x9, x8, x8, x9
; CHECK64-NEXT: mov sp, x9
+; CHECK64-NEXT: mov w20, w0
; CHECK64-NEXT: add x10, x19, #0
; CHECK64-NEXT: stp x9, x8, [x19]
; CHECK64-NEXT: msr TPIDR2_EL0, x10
@@ -3387,10 +3387,10 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: sub sp, sp, #1040
; CHECK1024-NEXT: rdsvl x8, #1
; CHECK1024-NEXT: mov x9, sp
-; CHECK1024-NEXT: mov w20, w0
-; CHECK1024-NEXT: msub x9, x8, x8, x9
; CHECK1024-NEXT: mov x19, sp
+; CHECK1024-NEXT: msub x9, x8, x8, x9
; CHECK1024-NEXT: mov sp, x9
+; CHECK1024-NEXT: mov w20, w0
; CHECK1024-NEXT: add x10, x19, #0
; CHECK1024-NEXT: stp x9, x8, [x19]
; CHECK1024-NEXT: msr TPIDR2_EL0, x10
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 37adfb89e4762..4dec5471e689c 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -aarch64-new-sme-abi=false | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64
@@ -524,6 +524,77 @@ declare ptr @memset(ptr, i32, i32)
; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: VariableSized, Align: 16, Size: 0
define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" {
+; CHECK-SDAG-LABEL: vastate:
+; CHECK-SDAG: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 112
+; CHECK-SDAG-NEXT: cntd x9
+; CHECK-SDAG-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x9, [sp, #80] // 8-byte Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: add x29, sp, #64
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 48
+; CHECK-SDAG-NEXT: .cfi_offset w19, -8
+; CHECK-SDAG-NEXT: .cfi_offset w20, -16
+; CHECK-SDAG-NEXT: .cfi_offset vg, -32
+; CHECK-SDAG-NEXT: .cfi_offset w30, -40
+; CHECK-SDAG-NEXT: .cfi_offset w29, -48
+; CHECK-SDAG-NEXT: .cfi_offset b8, -56
+; CHECK-SDAG-NEXT: .cfi_offset b9, -64
+; CHECK-SDAG-NEXT: .cfi_offset b10, -72
+; CHECK-SDAG-NEXT: .cfi_offset b11, -80
+; CHECK-SDAG-NEXT: .cfi_offset b12, -88
+; CHECK-SDAG-NEXT: .cfi_offset b13, -96
+; CHECK-SDAG-NEXT: .cfi_offset b14, -104
+; CHECK-SDAG-NEXT: .cfi_offset b15, -112
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: mov w20, w0
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: sub x10, x29, #80
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x10
+; CHECK-SDAG-NEXT: smstop sm
+; CHECK-SDAG-NEXT: bl other
+; CHECK-SDAG-NEXT: smstart sm
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #80
+; CHECK-SDAG-NEXT: cbnz x8, .LBB8_2
+; CHECK-SDAG-NEXT: // %bb.1: // %entry
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB8_2: // %entry
+; CHECK-SDAG-NEXT: mov w0, w20
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: sub sp, x29, #64
+; CHECK-SDAG-NEXT: .cfi_def_cfa wsp, 112
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 0
+; CHECK-SDAG-NEXT: .cfi_restore w19
+; CHECK-SDAG-NEXT: .cfi_restore w20
+; CHECK-SDAG-NEXT: .cfi_restore vg
+; CHECK-SDAG-NEXT: .cfi_restore w30
+; CHECK-SDAG-NEXT: .cfi_restore w29
+; CHECK-SDAG-NEXT: .cfi_restore b8
+; CHECK-SDAG-NEXT: .cfi_restore b9
+; CHECK-SDAG-NEXT: .cfi_restore b10
+; CHECK-SDAG-NEXT: .cfi_restore b11
+; CHECK-SDAG-NEXT: .cfi_restore b12
+; CHECK-SDAG-NEXT: .cfi_restore b13
+; CHECK-SDAG-NEXT: .cfi_restore b14
+; CHECK-SDAG-NEXT: .cfi_restore b15
+; CHECK-SDAG-NEXT: ret
+;
; CHECK-LABEL: vastate:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
@@ -553,9 +624,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: mov w20, w0
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: mov w20, w0
; CHECK-NEXT: sub x10, x29, #80
; CHECK-NEXT: stp x9, x8, [x29, #-80]
; CHECK-NEXT: msr TPIDR2_EL0, x10
@@ -594,77 +665,6 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: vastate:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 112
-; CHECK-NEWLOWERING-NEXT: cntd x9
-; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x9, [sp, #80] // 8-byte Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 48
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -8
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w20, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset vg, -32
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -40
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -48
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b8, -56
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b9, -64
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b10, -72
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b11, -80
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b12, -88
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b13, -96
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b14, -104
-; CHECK-NEWLOWERING-NEXT: .cfi_offset b15, -112
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: mov w20, w0
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: smstop sm
-; CHECK-NEWLOWERING-NEXT: bl other
-; CHECK-NEWLOWERING-NEXT: smstart sm
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %entry
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %entry
-; CHECK-NEWLOWERING-NEXT: mov w0, w20
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa wsp, 112
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEWLOWERING-NEXT: .cfi_restore w19
-; CHECK-NEWLOWERING-NEXT: .cfi_restore w20
-; CHECK-NEWLOWERING-NEXT: .cfi_restore vg
-; CHECK-NEWLOWERING-NEXT: .cfi_restore w30
-; CHECK-NEWLOWERING-NEXT: .cfi_restore w29
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b8
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b9
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b10
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b11
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b12
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b13
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b14
-; CHECK-NEWLOWERING-NEXT: .cfi_restore b15
-; CHECK-NEWLOWERING-NEXT: ret
entry:
tail call void @other()
ret i32 %x
More information about the llvm-commits
mailing list