[llvm] [AArch64] Add SME peephole optimizer pass (PR #104612)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 16 09:25:58 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
Changes:
This pass removes back-to-back smstart/smstop instructions
in order to reduce the number of streaming mode changes in a function.
The implementation as proposed does not aim to handle every case yet;
it points out a number of cases that can be optimized in the future.
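
To illustrate the basic case (this mirrors the `test0` function in the new sme-peephole-opts.ll test below): when a non-streaming caller makes two consecutive calls to a streaming callee, each call previously got its own smstart/smstop pair, and the pass removes the redundant smstop/smstart between the calls so that both calls share a single streaming region. A minimal sketch of such input, using a hypothetical caller name:

```llvm
declare void @callee()

; Without the peephole: smstart sm / bl callee / smstop sm / smstart sm / bl callee / smstop sm
; With the peephole:    smstart sm / bl callee / bl callee / smstop sm
define void @caller() nounwind {
  call void @callee() "aarch64_pstate_sm_enabled"
  call void @callee() "aarch64_pstate_sm_enabled"
  ret void
}
```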
---
Patch is 36.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/104612.diff
11 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64.h (+2)
- (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+9)
- (modified) llvm/lib/Target/AArch64/CMakeLists.txt (+1)
- (added) llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp (+216)
- (modified) llvm/test/CodeGen/AArch64/O3-pipeline.ll (+1)
- (added) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+474)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (+2-18)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (-2)
- (modified) llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll (+6-1)
- (modified) llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll (-12)
- (modified) llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll (-2)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index ff19327c692021..62fbf94e803f0c 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -59,6 +59,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
+FunctionPass *createSMEPeepholeOptPass();
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -110,6 +111,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeSMEABIPass(PassRegistry &);
+void initializeSMEPeepholeOptPass(PassRegistry &);
void initializeSVEIntrinsicOptsPass(PassRegistry &);
void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index bcd677310d1247..bd5684a287381a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -167,6 +167,11 @@ static cl::opt<bool>
cl::desc("Enable SVE intrinsic opts"),
cl::init(true));
+static cl::opt<bool>
+ EnableSMEPeepholeOpt("enable-aarch64-sme-peephole-opt", cl::init(true),
+ cl::Hidden,
+ cl::desc("Perform SME peephole optimization"));
+
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@@ -256,6 +261,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeLDTLSCleanupPass(*PR);
initializeKCFIPass(*PR);
initializeSMEABIPass(*PR);
+ initializeSMEPeepholeOptPass(*PR);
initializeSVEIntrinsicOptsPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
initializeAArch64SLSHardeningPass(*PR);
@@ -754,6 +760,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
void AArch64PassConfig::addMachineSSAOptimization() {
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
+ addPass(createSMEPeepholeOptPass());
+
// Run default MachineSSAOptimization first.
TargetPassConfig::addMachineSSAOptimization();
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 639bc0707dff24..da13db8e68b0e6 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -87,6 +87,7 @@ add_llvm_target(AArch64CodeGen
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
SMEABIPass.cpp
+ SMEPeepholeOpt.cpp
SVEIntrinsicOpts.cpp
AArch64SIMDInstrOpt.cpp
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
new file mode 100644
index 00000000000000..e6b8c6664f9fee
--- /dev/null
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -0,0 +1,216 @@
+//===- SMEPeepholeOpt.cpp - SME peephole optimization pass-----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass tries to remove back-to-back (smstart, smstop) and
+// (smstop, smstart) sequences. The pass is conservative when it cannot
+// determine that it is safe to remove these sequences.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64SMEAttributes.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-sme-peephole-opt"
+
+namespace {
+
+struct SMEPeepholeOpt : public MachineFunctionPass {
+ static char ID;
+
+ SMEPeepholeOpt() : MachineFunctionPass(ID) {
+ initializeSMEPeepholeOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SME Peephole Optimization pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool optimizeStartStopPairs(MachineBasicBlock &MBB,
+ bool &HasRemainingSMChange) const;
+};
+
+char SMEPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+static bool isConditionalStartStop(const MachineInstr *MI) {
+ return MI->getOpcode() == AArch64::MSRpstatePseudo;
+}
+
+static bool isMatchingStartStopPair(const MachineInstr *MI1,
+ const MachineInstr *MI2) {
+ // We only consider the same type of streaming mode change here, i.e.
+ // start/stop SM, or start/stop ZA pairs.
+ if (MI1->getOperand(0).getImm() != MI2->getOperand(0).getImm())
+ return false;
+
+ // One must be 'start', the other must be 'stop'
+ if (MI1->getOperand(1).getImm() == MI2->getOperand(1).getImm())
+ return false;
+
+ bool IsConditional = isConditionalStartStop(MI2);
+ if (isConditionalStartStop(MI1) != IsConditional)
+ return false;
+
+ if (!IsConditional)
+ return true;
+
+ // Check to make sure the conditional start/stop pairs are identical.
+ if (MI1->getOperand(2).getImm() != MI2->getOperand(2).getImm())
+ return false;
+
+ // Ensure reg masks are identical.
+ if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask())
+ return false;
+
+ // This optimisation is unlikely to happen in practice for conditional
+ // smstart/smstop pairs as the virtual registers for pstate.sm will always
+ // be different.
+ // TODO: For this optimisation to apply to conditional smstart/smstop,
+ // this pass will need to do more work to remove redundant calls to
+ // __arm_sme_state.
+
+ // Only consider conditional start/stop pairs which read the same register
+ // holding the original value of pstate.sm, as some conditional start/stops
+ // require the state on entry to the function.
+ if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) {
+ Register Reg1 = MI1->getOperand(3).getReg();
+ Register Reg2 = MI2->getOperand(3).getReg();
+ if (Reg1.isPhysical() || Reg2.isPhysical() || Reg1 != Reg2)
+ return false;
+ }
+
+ return true;
+}
+
+static bool ChangesStreamingMode(const MachineInstr *MI) {
+ assert((MI->getOpcode() == AArch64::MSRpstatesvcrImm1 ||
+ MI->getOpcode() == AArch64::MSRpstatePseudo) &&
+ "Expected MI to be a smstart/smstop instruction");
+ return MI->getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
+ MI->getOperand(0).getImm() == AArch64SVCR::SVCRSMZA;
+}
+
+bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB,
+ bool &HasRemainingSMChange) const {
+ SmallVector<MachineInstr *, 4> ToBeRemoved;
+
+ bool Changed = false;
+ MachineInstr *Prev = nullptr;
+ HasRemainingSMChange = false;
+
+ auto Reset = [&]() {
+ if (Prev && ChangesStreamingMode(Prev))
+ HasRemainingSMChange = true;
+ Prev = nullptr;
+ ToBeRemoved.clear();
+ };
+
+ // Walk through instructions in the block trying to find pairs of smstart
+ // and smstop nodes that cancel each other out. We only permit a limited
+ // set of instructions to appear between them, otherwise we reset our
+ // tracking.
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
+ switch (MI.getOpcode()) {
+ default:
+ Reset();
+ break;
+ case AArch64::COPY: {
+ // Permit copies of 32 and 64-bit registers.
+ if (!MI.getOperand(1).isReg()) {
+ Reset();
+ break;
+ }
+ Register Reg = MI.getOperand(1).getReg();
+ if (!AArch64::GPR32RegClass.contains(Reg) &&
+ !AArch64::GPR64RegClass.contains(Reg))
+ Reset();
+ break;
+ }
+ case AArch64::ADJCALLSTACKDOWN:
+ case AArch64::ADJCALLSTACKUP:
+ case AArch64::ANDXri:
+ case AArch64::ADDXri:
+ // We permit these as they don't generate SVE/NEON instructions.
+ break;
+ case AArch64::VGRestorePseudo:
+ case AArch64::VGSavePseudo:
+ // When the smstart/smstop are removed, we should also remove
+ // the pseudos that save/restore the VG value for CFI info.
+ ToBeRemoved.push_back(&MI);
+ break;
+ case AArch64::MSRpstatesvcrImm1:
+ case AArch64::MSRpstatePseudo: {
+ if (!Prev)
+ Prev = &MI;
+ else if (isMatchingStartStopPair(Prev, &MI)) {
+ // If they match, we can remove them, and possibly any instructions
+ // that we marked for deletion in between.
+ Prev->eraseFromParent();
+ MI.eraseFromParent();
+ for (MachineInstr *TBR : ToBeRemoved)
+ TBR->eraseFromParent();
+ ToBeRemoved.clear();
+ Prev = nullptr;
+ Changed = true;
+ } else {
+ Reset();
+ Prev = &MI;
+ }
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
+ "SME Peephole Optimization", false, false)
+
+bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
+ return false;
+
+ assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+ bool Changed = false;
+ bool FunctionHasRemainingSMChange = false;
+
+ // Even if the block lives in a function with no SME attributes attached we
+ // still have to analyze all the blocks because we may call a streaming
+ // function that requires smstart/smstop pairs.
+ for (MachineBasicBlock &MBB : MF) {
+ bool BlockHasRemainingSMChange;
+ Changed |= optimizeStartStopPairs(MBB, BlockHasRemainingSMChange);
+ FunctionHasRemainingSMChange |= BlockHasRemainingSMChange;
+ }
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (Changed && AFI->hasStreamingModeChanges())
+ AFI->setHasStreamingModeChanges(FunctionHasRemainingSMChange);
+
+ return Changed;
+}
+
+FunctionPass *llvm::createSMEPeepholeOptPass() { return new SMEPeepholeOpt(); }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 72a888bde5ebbc..3465b717261cf5 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -122,6 +122,7 @@
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: SME Peephole Optimization pass
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Early Tail Duplication
; CHECK-NEXT: Optimize machine instruction PHIs
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
new file mode 100644
index 00000000000000..668202b6e533ad
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+declare void @callee()
+declare void @callee_farg(float)
+declare float @callee_farg_fret(float)
+
+; normal caller -> streaming callees
+define void @test0() nounwind {
+; CHECK-LABEL: test0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: str x0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee() "aarch64_pstate_sm_enabled"
+ call void @callee() "aarch64_pstate_sm_enabled"
+ ret void
+}
+
+; streaming caller -> normal callees
+define void @test1() nounwind "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: str x0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee()
+ call void @callee()
+ ret void
+}
+
+; streaming-compatible caller -> normal callees
+; these conditional smstart/smstop are not yet optimized away.
+define void @test2() nounwind "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz w19, #0, .LBB2_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB2_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz w19, #0, .LBB2_6
+; CHECK-NEXT: // %bb.5:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB2_6:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB2_8
+; CHECK-NEXT: // %bb.7:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB2_8:
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee()
+ call void @callee()
+ ret void
+}
+
+; streaming-compatible caller -> mixed callees
+define void @test3() nounwind "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: test3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbnz w19, #0, .LBB3_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbnz w19, #0, .LBB3_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbz w19, #0, .LBB3_6
+; CHECK-NEXT: // %bb.5:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB3_6:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB3_8
+; CHECK-NEXT: // %bb.7:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB3_8:
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: tbnz w19, #0, .LBB3_10
+; CHECK-NEXT: // %bb.9:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB3_10:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbnz w19, #0, .LBB3_12
+; CHECK-NEXT: // %bb.11:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB3_12:
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee() "aarch64_pstate_sm_enabled"
+ call void @callee()
+ call void @callee() "aarch64_pstate_sm_enabled"
+ ret void
+}
+
+; streaming caller -> normal callees (pass 0.0f)
+define void @test4() nounwind "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: str x0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: fmov s0, wzr
+; CHECK-NEXT: bl callee_farg
+; CHECK-NEXT: fmov s0, wzr
+; CHECK-NEXT: bl callee_farg
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @callee_farg(float zeroinitializer)
+ call void @callee_farg(float zeroinitializer)
+ ret void
+}
+
+; streaming caller -> normal callees (pass fp arg)
+define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: test5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: str x0, [sp, #88] // 8-byte Folded Spill
+; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: bl callee_farg
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: bl callee_farg
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+...
[truncated]
``````````
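
For reference, the pass is added to the machine SSA optimization pipeline and runs by default at optimization levels above -O0; the new hidden `-enable-aarch64-sme-peephole-opt` option can be used to disable it. A hypothetical RUN line, modelled on the one in the new test:

```llvm
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -enable-aarch64-sme-peephole-opt=false < %s
```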
https://github.com/llvm/llvm-project/pull/104612