[llvm] bd4935c - [AArch64][SME] Implement ABI for calls from streaming-compatible functions.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 16 07:48:54 PDT 2022
Author: Sander de Smalen
Date: 2022-09-16T14:48:37Z
New Revision: bd4935c175ad29f907a80cab6cfa6511db819c3d
URL: https://github.com/llvm/llvm-project/commit/bd4935c175ad29f907a80cab6cfa6511db819c3d
DIFF: https://github.com/llvm/llvm-project/commit/bd4935c175ad29f907a80cab6cfa6511db819c3d.diff
LOG: [AArch64][SME] Implement ABI for calls from streaming-compatible functions.
When a streaming-compatible function calls a function with a normal or streaming
interface, it may need to enable/disable streaming mode before the call, and
needs to restore PSTATE.SM to its original value after the call.
This patch implements this with a pseudo instruction that gets expanded into a
conditional branch and an smstart/smstop instruction.
More details about the SME attributes and design can be found
in D131562.
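As a minimal illustration (a sketch mirroring the tests added below; the
function name @sc_caller is purely illustrative), consider a
streaming-compatible function calling a normal callee:

  define void @sc_caller() "aarch64_pstate_sm_compatible" nounwind {
    call void @normal_callee()
    ret void
  }
  declare void @normal_callee()

Since @sc_caller may be entered with PSTATE.SM either on or off, the lowered
code calls __arm_sme_state to read the current state, conditionally executes
'smstop sm' before the call (only if streaming mode is active), and
conditionally executes 'smstart sm' after the call returns; see
@streaming_compatible_caller_normal_callee in the added test for the exact
sequence.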
Reviewed By: aemerson
Differential Revision: https://reviews.llvm.org/D131578
Added:
llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
Modified:
llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1c034f43d844..0b36f1f45fbd 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -89,6 +89,8 @@ class AArch64ExpandPseudo : public MachineFunctionPass {
bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
};
} // end anonymous namespace
@@ -849,6 +851,99 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext(
return true;
}
+MachineBasicBlock *
+AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ // In the case of an smstart/smstop before an unreachable, just remove the
+ // pseudo. Exception handling code generated by Clang may introduce
+ // unreachables, and it seems unnecessary to restore pstate.sm when that
+ // happens. Note that this is not just an optimisation: the code below
+ // expects a successor instruction/block in order to split the block at MBBI.
+ if (std::next(MBBI) == MBB.end() &&
+ MI.getParent()->successors().begin() ==
+ MI.getParent()->successors().end()) {
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ // Expand the pseudo into an smstart or smstop instruction. The pseudo has the
+ // following operands:
+ //
+ // MSRpstatePseudo <za|sm|both>, <0|1>, pstate.sm, expectedval, <regmask>
+ //
+ // The pseudo is expanded into a conditional smstart/smstop: the live value
+ // of pstate.sm (in a register) is compared against the expected value, and
+ // the smstart/smstop is only executed if they differ.
+ //
+ // As an example, the following block contains a normal call from a
+ // streaming-compatible function:
+ //
+ // OrigBB:
+ // MSRpstatePseudo 3, 0, %0, 0, <regmask> <- Conditional SMSTOP
+ // bl @normal_callee
+ // MSRpstatePseudo 3, 1, %0, 0, <regmask> <- Conditional SMSTART
+ //
+ // ...which will be transformed into:
+ //
+ // OrigBB:
+ // TBNZx %0:gpr64, 0, SMBB
+ // b EndBB
+ //
+ // SMBB:
+ // MSRpstatesvcrImm1 3, 0, <regmask> <- SMSTOP
+ //
+ // EndBB:
+ // bl @normal_callee
+ // MSRpstatePseudo 3, 1, %0, 0, <regmask> <- Conditional SMSTART
+ //
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Create the conditional branch based on operand 3 of the instruction,
+ // which tells us whether the callee is a normal or a streaming function.
+ // We test the live value of pstate.sm and toggle pstate.sm if this is not the
+ // expected value for the callee (0 for a normal callee and 1 for a streaming
+ // callee).
+ auto PStateSM = MI.getOperand(2).getReg();
+ bool IsStreamingCallee = MI.getOperand(3).getImm();
+ unsigned Opc = IsStreamingCallee ? AArch64::TBZX : AArch64::TBNZX;
+ MachineInstrBuilder Tbx =
+ BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(PStateSM).addImm(0);
+
+ // Split MBB and create two new blocks:
+ // - MBB now contains all instructions before MSRpstatePseudo.
+ // - SMBB contains the MSRpstatePseudo instruction only.
+ // - EndBB contains all instructions after MSRpstatePseudo.
+ MachineInstr &PrevMI = *std::prev(MBBI);
+ MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true);
+ MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end()
+ ? *SMBB->successors().begin()
+ : SMBB->splitAt(MI, /*UpdateLiveIns*/ true);
+
+ // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB.
+ Tbx.addMBB(SMBB);
+ BuildMI(&MBB, DL, TII->get(AArch64::B))
+ .addMBB(EndBB);
+ MBB.addSuccessor(EndBB);
+
+ // Create the SMSTART/SMSTOP (MSRpstatesvcrImm1) instruction in SMBB.
+ MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->begin(), MI.getDebugLoc(),
+ TII->get(AArch64::MSRpstatesvcrImm1));
+ // Copy all but operands 2 and 3 of MSRpstatePseudo (as these contain the
+ // CopyFromReg for the pstate.sm value and the flag to indicate whether the
+ // callee is streaming or normal).
+ MIB.add(MI.getOperand(0));
+ MIB.add(MI.getOperand(1));
+ for (unsigned i = 4; i < MI.getNumOperands(); ++i)
+ MIB.add(MI.getOperand(i));
+
+ BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
+
+ MI.eraseFromParent();
+ return EndBB;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1276,6 +1371,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandCALL_BTI(MBB, MBBI);
case AArch64::StoreSwiftAsyncContext:
return expandStoreSwiftAsyncContext(MBB, MBBI);
+ case AArch64::MSRpstatePseudo: {
+ auto *NewMBB = expandCondSMToggle(MBB, MBBI);
+ if (NewMBB != &MBB)
+ NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
+ return true;
+ }
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84674703735e..f55314dba63e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2536,7 +2536,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
-
case TargetOpcode::STATEPOINT:
// STATEPOINT is a pseudo instruction which has no implicit defs/uses
// while bl call instruction (where statepoint will be lowered at the end)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index d309b363e6c4..69d1eda88937 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -147,6 +147,21 @@ def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>;
def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>;
+// Pseudo to match smstart/smstop. This expands:
+//
+// pseudonode (pstate_za|pstate_sm), before_call, expected_value
+//
+// Into:
+//
+// if (before_call != expected_value)
+// node (pstate_za|pstate_sm)
+//
+// where node can be either 'smstart' or 'smstop'.
+def MSRpstatePseudo :
+ Pseudo<(outs),
+ (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
+ Sched<[WriteSys]>;
+
// Scenario A:
//
// %pstate.before.call = 1
@@ -175,6 +190,12 @@ def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)), // before
def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)), // after call
(MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+// The generic case which gets expanded to a pseudo node.
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
+ (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
+ (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+
// Read and write TPIDR2_EL0
def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
(MSR 0xde85, GPR64:$val)>;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
new file mode 100644
index 000000000000..d795f98f18de
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme < %s | FileCheck %s
+
+; This file tests the following combinations related to streaming-compatible functions:
+; [ ] N -> SC (Normal -> Streaming-compatible)
+; [ ] SC -> N (Streaming-compatible -> Normal)
+; [ ] SC -> S (Streaming-compatible -> Streaming)
+; [ ] SC -> SC (Streaming-compatible -> Streaming-compatible)
+;
+; The following combination is tested in sme-streaming-interface.ll
+; [ ] S -> SC (Streaming -> Streaming-compatible)
+
+declare void @normal_callee();
+declare void @streaming_callee() "aarch64_pstate_sm_enabled";
+declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
+
+; [x] N -> SC (Normal -> Streaming-compatible)
+; [ ] SC -> N (Streaming-compatible -> Normal)
+; [ ] SC -> S (Streaming-compatible -> Streaming)
+; [ ] SC -> SC (Streaming-compatible -> Streaming-compatible)
+define void @normal_caller_streaming_compatible_callee() nounwind {
+; CHECK-LABEL: normal_caller_streaming_compatible_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl streaming_compatible_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @streaming_compatible_callee();
+ ret void;
+}
+
+; [ ] N -> SC (Normal -> Streaming-compatible)
+; [x] SC -> N (Streaming-compatible -> Normal)
+; [ ] SC -> S (Streaming-compatible -> Streaming)
+; [ ] SC -> SC (Streaming-compatible -> Streaming-compatible)
+define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: streaming_compatible_caller_normal_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz x19, #0, .LBB1_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: bl normal_callee
+; CHECK-NEXT: tbz x19, #0, .LBB1_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ call void @normal_callee();
+ ret void;
+}
+
+; Streaming Compatible Caller, Streaming Callee
+
+; [ ] N -> SC (Normal -> Streaming-compatible)
+; [ ] SC -> N (Streaming-compatible -> Normal)
+; [x] SC -> S (Streaming-compatible -> Streaming)
+; [ ] SC -> SC (Streaming-compatible -> Streaming-compatible)
+define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbnz x19, #0, .LBB2_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz x19, #0, .LBB2_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ call void @streaming_callee();
+ ret void;
+}
+
+; [ ] N -> SC (Normal -> Streaming-compatible)
+; [ ] SC -> N (Streaming-compatible -> Normal)
+; [ ] SC -> S (Streaming-compatible -> Streaming)
+; [x] SC -> SC (Streaming-compatible -> Streaming-compatible)
+define void @streaming_compatible_caller_and_callee() "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: streaming_compatible_caller_and_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl streaming_compatible_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+
+ call void @streaming_compatible_callee();
+ ret void;
+}
+
+
+;
+; Handle special cases here.
+;
+
+define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
+; CHECK-LABEL: streaming_compatible_with_neon_vectors:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz x19, #0, .LBB4_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl normal_callee_vec_arg
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: tbz x19, #0, .LBB4_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB4_4:
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: ret
+ %res = call <2 x double> @normal_callee_vec_arg(<2 x double> %arg)
+ %fadd = fadd <2 x double> %res, %arg
+ ret <2 x double> %fadd
+}
+declare <2 x double> @normal_callee_vec_arg(<2 x double>)
+
+define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
+; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz x19, #0, .LBB5_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: bl normal_callee_scalable_vec_arg
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: tbz x19, #0, .LBB5_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB5_4:
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fadd z0.d, z1.d, z0.d
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
+ %fadd = fadd <vscale x 2 x double> %res, %arg
+ ret <vscale x 2 x double> %fadd
+}
+
+declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double>)
+
+define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
+; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p0, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz x19, #0, .LBB6_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: ldr p0, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: bl normal_callee_predicate_vec_arg
+; CHECK-NEXT: str p0, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: tbz x19, #0, .LBB6_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: ldr p0, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
+ %and = and <vscale x 2 x i1> %res, %arg
+ ret <vscale x 2 x i1> %and
+}
+
+declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
+
+define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: conditional_smstart_unreachable_block:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbnz x19, #0, .LBB7_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: bl streaming_callee
+ call void @streaming_callee()
+ unreachable
+}
+
+define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind {
+; CHECK-LABEL: conditional_smstart_no_successor_block:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tbz w0, #0, .LBB8_6
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbnz x19, #0, .LBB8_3
+; CHECK-NEXT: // %bb.2: // %if.then
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB8_3: // %if.then
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz x19, #0, .LBB8_5
+; CHECK-NEXT: // %bb.4: // %if.then
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB8_5: // %if.then
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .LBB8_6: // %exit
+; CHECK-NEXT: ret
+ br i1 %p, label %if.then, label %exit
+
+if.then:
+ call void @streaming_callee()
+ br label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sve" }