[llvm-branch-commits] [llvm] PR for llvm/llvm-project#80140 (PR #80141)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 31 05:35:37 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: None (llvmbot)
resolves llvm/llvm-project#80140
---
Patch is 93.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80141.diff
11 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp (+6)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+20-4)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+3-1)
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+35)
- (modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+22)
- (modified) llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll (+12-8)
- (added) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+1640)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (+4)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll (+20-9)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 352c61d48e2ff..1af064b6de3cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1544,6 +1544,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
return true;
}
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ MI.eraseFromParent();
+ return true;
case AArch64::LD1B_2Z_IMM_PSEUDO:
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 332fb37655288..a59b1f2ec3c1c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2375,6 +2375,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER:
break;
+ MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -7154,13 +7155,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
}
}
+static bool isPassedInFPR(EVT VT) {
+ return VT.isFixedLengthVector() ||
+ (VT.isFloatingPoint() && !VT.isScalableVector());
+}
+
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const {
+ SDValue ThisVal, bool RequiresSMChange) const {
DenseMap<unsigned, SDValue> CopiedRegs;
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -7205,6 +7211,10 @@ SDValue AArch64TargetLowering::LowerCallResult(
break;
}
+ if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
+ Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
+ Val);
+
InVals.push_back(Val);
}
@@ -7915,6 +7925,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
return ArgReg.Reg == VA.getLocReg();
});
} else {
+ // Add an extra level of indirection for streaming mode changes by
+ // using a pseudo copy node that cannot be rematerialised between a
+ // smstart/smstop and the call by the simple register coalescer.
+ if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
+ Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
+ Arg.getValueType(), Arg);
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
@@ -8151,9 +8167,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs,
- DL, DAG, InVals, IsThisReturn,
- IsThisReturn ? OutVals[0] : SDValue());
+ SDValue Result = LowerCallResult(
+ Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
if (!Ins.empty())
InGlue = Result.getValue(Result->getNumValues() - 1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6505931e17e18..541a810fb5cba 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,8 @@ enum NodeType : unsigned {
CALL_BTI, // Function call followed by a BTI instruction.
+ COALESCER_BARRIER,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
@@ -1026,7 +1028,7 @@ class AArch64TargetLowering : public TargetLowering {
const SmallVectorImpl<CCValAssign> &RVLocs,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const;
+ SDValue ThisVal, bool RequiresSMChange) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ea9882160d6fb..f86e6947c9cdb 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1015,6 +1015,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
const TargetRegisterClass *DstRC, unsigned DstSubReg,
const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
if (MI->isCopy() &&
((DstRC->getID() == AArch64::GPR64RegClassID) ||
(DstRC->getID() == AArch64::GPR64commonRegClassID)) &&
@@ -1023,5 +1025,38 @@ bool AArch64RegisterInfo::shouldCoalesce(
// which implements a 32 to 64 bit zero extension
// which relies on the upper 32 bits being zeroed.
return false;
+
+ auto IsCoalescerBarrier = [](const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ return true;
+ default:
+ return false;
+ }
+ };
+
+ // For calls that temporarily have to toggle streaming mode as part of the
+ // call-sequence, we need to be more careful when coalescing copy instructions
+ // so that we don't end up coalescing the NEON/FP result or argument register
+ // with a whole Z-register, such that after coalescing the register allocator
+ // will try to spill/reload the entire Z register.
+ //
+ // We do this by checking if the node has any defs/uses that are
+ // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to
+ // instruct the coalescer to avoid coalescing the copy.
+ if (MI->isCopy() && SubReg != DstSubReg &&
+ (AArch64::ZPRRegClass.hasSubClassEq(DstRC) ||
+ AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier))
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier))
+ return false;
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index eeae5303a3f89..acf067f2cc5a9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -28,6 +28,8 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2,
def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+def AArch64CoalescerBarrier
+ : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>;
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
@@ -189,6 +191,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
(MSR 0xde85, GPR64:$val)>;
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
(MRS 0xde85)>;
+
+multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
+ def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> {
+ let Constraints = "$dst = $src";
+ }
+ foreach vt = vts in {
+ def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))),
+ (!cast<Instruction>(NAME) rc:$src)>;
+ }
+}
+
+multiclass CoalescerBarriers {
+ defm _FPR16 : CoalescerBarrierPseudo<FPR16, [bf16, f16]>;
+ defm _FPR32 : CoalescerBarrierPseudo<FPR32, [f32]>;
+ defm _FPR64 : CoalescerBarrierPseudo<FPR64, [f64, v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16]>;
+ defm _FPR128 : CoalescerBarrierPseudo<FPR128, [f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64, v8bf16]>;
+}
+
+defm COALESCER_BARRIER : CoalescerBarriers;
+
} // End let Predicates = [HasSME]
// Pseudo to match to smstart/smstop. This expands:
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index e18e18a1cfad1..381091b453943 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -23,9 +23,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-FISEL-NEXT: bl streaming_callee
; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstop sm
+; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0
; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0]
-; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: fadd d0, d1, d0
; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -49,9 +49,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-GISEL-NEXT: bl streaming_callee
; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstop sm
+; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-GISEL-NEXT: fmov d0, x8
-; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: fadd d0, d1, d0
; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -82,9 +82,9 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: bl normal_callee
; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d0, x8
-; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: fadd d0, d1, d0
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -110,14 +110,16 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl normal_callee
; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d0, x8
-; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: fadd d0, d1, d0
; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
@@ -329,9 +331,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
-; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
@@ -390,9 +392,9 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
-; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl fmodf
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
@@ -420,7 +422,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
+; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: and x19, x0, #0x1
+; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2
; CHECK-COMMON-NEXT: // %bb.1:
; CHECK-COMMON-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
new file mode 100644
index 0000000000000..d5bea725b6d14
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -0,0 +1,1640 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-unknown-eabi-elf"
+
+; This test verifies that call arguments and results are not coalesced
+; with SVE vector registers by the coalescer, such that no 'mul vl'
+; ldr/str pairs are generated in the streaming-mode-changing call
+; sequence.
+
+;
+; Scalar arguments
+;
+
+define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 16 x i8> poison, i8 %arg, i32 0
+ call void @use_i8(i8 %arg)
+ store <vscale x 16 x i8> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i16> poison, i16 %arg, i32 0
+ call void @use_i16(i16 %arg)
+ store <vscale x 8 x i16> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i32> poison, i32 %arg, i32 0
+ call void @use_i32(i32 %arg)
+ store <vscale x 4 x i32> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i64
+; CHECK-...
[truncated]
``````````
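For orientation, here is a minimal sketch, condensed from the added `dont_coalesce_arg_*` tests, of the kind of streaming-mode-changing call this patch protects. The callee name and the exact function attributes are assumptions, since the attribute group (`#0`) is not visible in the truncated diff. The float argument is both passed to a non-streaming callee (so the call is wrapped in an smstop/smstart pair) and inserted into a scalable vector; without the COALESCER_BARRIER nodes the register coalescer could merge the FPR copy into a whole Z register, forcing a full Z-register spill/reload ('mul vl' ldr/str) across the mode change.

```llvm
; Sketch only: callee name and attribute spelling are assumed, not copied
; from the truncated diff.
target triple = "aarch64-unknown-unknown-eabi-elf"

; Non-streaming callee; calling it from a streaming function requires
; smstop/smstart around the call.
declare void @use_f32(float)

define void @sketch_dont_coalesce_arg_f32(float %arg, ptr %ptr) nounwind "aarch64_pstate_sm_enabled" "target-features"="+sme,+sve" {
  ; %arg is live in an FPR across the mode-changing call *and* feeds an SVE
  ; vector, so a naive coalesce would tie the FPR value to a Z register.
  %vec = insertelement <vscale x 4 x float> poison, float %arg, i32 0
  call void @use_f32(float %arg)
  store <vscale x 4 x float> %vec, ptr %ptr
  ret void
}
```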
https://github.com/llvm/llvm-project/pull/80141
More information about the llvm-branch-commits mailing list