[llvm] 5166345 - [SVE][AArch64] Refine hasSVEArgsOrReturn

Matt Devereau via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 1 06:26:16 PDT 2022


Author: Matt Devereau
Date: 2022-07-01T13:24:55Z
New Revision: 5166345f50412f1a380948c18809545c4b7a9bd3

URL: https://github.com/llvm/llvm-project/commit/5166345f50412f1a380948c18809545c4b7a9bd3
DIFF: https://github.com/llvm/llvm-project/commit/5166345f50412f1a380948c18809545c4b7a9bd3.diff

LOG: [SVE][AArch64] Refine hasSVEArgsOrReturn

As described in the AAPCS64 (https://github.com/ARM-software/abi-aa/blob/2022Q1/aapcs64/aapcs64.rst#scalable-vector-registers),
the AAVPCS is used only when registers z0-z7 take an SVE argument. Previously, when floats occupied the lower bits of registers
z0-z7 but SVE arguments were assigned to registers beyond z7, the function used the AAVPCS where it should use the AAPCS.

This is fixed by moving the SVE calling-convention deduction from AArch64RegisterInfo::hasSVEArgsOrReturn to
AArch64TargetLowering::LowerFormalArguments, where physical register lowering is more accurate.
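
For illustration, a minimal sketch of the previously mis-handled case (hypothetical
function name, analogous to the aapcs1 test added below): the eight float arguments
occupy the lower bits of z0-z7, so the scalable argument %v is passed indirectly and
no z register holds an SVE value. The old IR-type-based check in hasSVEArgsOrReturn
still classified such a function as SVE and selected the AAVPCS; with this change the
decision is made during formal argument lowering, so the plain AAPCS is used.

  define void @no_sve_in_z_regs(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %v, float * %ptr) nounwind {
  entry:
    %ptr.bc = bitcast float * %ptr to <vscale x 4 x float> *
    store volatile <vscale x 4 x float> %v, <vscale x 4 x float>* %ptr.bc
    ret void
  }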

Differential Revision: https://reviews.llvm.org/D127209

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
    llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
    llvm/lib/Target/AArch64/AArch64RegisterInfo.h
    llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 8cb4fa27aaaaa..ef4860979dd34 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -865,7 +865,7 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
   if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall ||
       MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SVE_VectorCall ||
-      STI->getRegisterInfo()->hasSVEArgsOrReturn(MF)) {
+      MF->getInfo<AArch64FunctionInfo>()->isSVECC()) {
     auto *TS =
         static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
     TS->emitDirectiveVariantPCS(CurrentFnSym);

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f149c20f93012..63cd8f93083cf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5675,8 +5675,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  const Function &F = MF.getFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
+  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
+                DAG.getTargetLoweringInfo(), MF.getDataLayout());
+  if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
+    FuncInfo->setIsSVECC(true);
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -5690,7 +5698,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
   // LocVT.
   unsigned NumArgs = Ins.size();
-  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
+  Function::const_arg_iterator CurOrigArg = F.arg_begin();
   unsigned CurArgIdx = 0;
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
@@ -5761,11 +5769,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
         RC = &AArch64::FPR128RegClass;
       else if (RegVT.isScalableVector() &&
-               RegVT.getVectorElementType() == MVT::i1)
+               RegVT.getVectorElementType() == MVT::i1) {
+        FuncInfo->setIsSVECC(true);
         RC = &AArch64::PPRRegClass;
-      else if (RegVT.isScalableVector())
+      } else if (RegVT.isScalableVector()) {
+        FuncInfo->setIsSVECC(true);
         RC = &AArch64::ZPRRegClass;
-      else
+      } else
         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
       // Transform the arguments in physical registers into virtual ones.
@@ -5887,7 +5897,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       // i1 arguments are zero-extended to i8 by the caller. Emit a
       // hint to reflect this.
       if (Ins[i].isOrigArg()) {
-        Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex());
+        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
         if (OrigArg->getType()->isIntegerTy(1)) {
           if (!Ins[i].Flags.isZExt()) {
             ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
@@ -5902,7 +5912,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
-  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   if (isVarArg) {
     if (!Subtarget->isTargetDarwin() || IsWin64) {
       // The AAPCS variadic function ABI is identical to the non-variadic
@@ -6215,7 +6224,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // The check for matching callee-saved regs will determine whether it is
   // eligible for TCO.
   if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
-      AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
+      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
     CallerCC = CallingConv::AArch64_SVE_VectorCall;
 
   bool CCMatch = CallerCC == CalleeCC;

diff  --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index df047fbb4b549..f070f989a5b7d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -177,6 +177,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
 
   bool IsMTETagged = false;
 
+  /// The function has a Scalable Vector or Scalable Predicate register argument
+  /// or return type.
+  bool IsSVECC = false;
+
   /// True if the function need unwind information.
   mutable Optional<bool> NeedsDwarfUnwindInfo;
 
@@ -191,6 +195,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  bool isSVECC() const { return IsSVECC; };
+  void setIsSVECC(bool s) { IsSVECC = s; };
+
   void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
 
   unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }

diff  --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index b186df8948b21..f7c06b9fb71b3 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -66,14 +66,6 @@ bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg,
   return true;
 }
 
-bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) {
-  const Function &F = MF->getFunction();
-  return isa<ScalableVectorType>(F.getReturnType()) ||
-         any_of(F.args(), [](const Argument &Arg) {
-           return isa<ScalableVectorType>(Arg.getType());
-         });
-}
-
 const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "Invalid MachineFunction pointer.");
@@ -111,7 +103,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     // This is for OSes other than Windows; Windows is a separate case further
     // above.
     return CSR_AArch64_AAPCS_X18_SaveList;
-  if (hasSVEArgsOrReturn(MF))
+  if (MF->getInfo<AArch64FunctionInfo>()->isSVECC())
     return CSR_AArch64_SVE_AAPCS_SaveList;
   return CSR_AArch64_AAPCS_SaveList;
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index ac8c8fe270689..12dd70fa4aa84 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -42,8 +42,6 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
   void UpdateCustomCallPreservedMask(MachineFunction &MF,
                                      const uint32_t **Mask) const;
 
-  static bool hasSVEArgsOrReturn(const MachineFunction *MF);
-
   /// Code Generation virtual methods...
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
   const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const;

diff  --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 40af9abcc555b..1a159558ba5d4 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -188,6 +188,193 @@ entry:
   ret double %x0
 }
 
+; Use AAVPCS, SVE register in z0-z7 used
+
+define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, <vscale x 4 x i32> %s7, <vscale x 4 x i32> %s8, <vscale x 4 x i32> %s9, <vscale x 4 x i32> %s10, <vscale x 4 x i32> %s11, <vscale x 4 x i32> %s12, <vscale x 4 x i32> %s13, <vscale x 4 x i32> %s14, <vscale x 4 x i32> %s15, <vscale x 4 x i32> %s16, i32 * %ptr) nounwind {
+; CHECK-LABEL: aavpcs1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp x8, x9, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x7]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z24.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x9]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast i32 * %ptr to <vscale x 4 x i32> *
+  store volatile <vscale x 4 x i32> %s7, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s8, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s9, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s11, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s12, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s13, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s14, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s15, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s16, <vscale x 4 x i32>* %ptr1.bc
+  ret void
+}
+
+; Use AAVPCS, SVE register in z0-z7 used
+
+define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, <vscale x 4 x float> %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12,<vscale x 4 x float> %s13,<vscale x 4 x float> %s14,<vscale x 4 x float> %s15,<vscale x 4 x float> %s16,float * %ptr) nounwind {
+; CHECK-LABEL: aavpcs2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp x8, x9, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x5]
+; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x4]
+; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x3]
+; CHECK-NEXT:    ld1w { z6.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z24.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x9]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast float * %ptr to <vscale x 4 x float> *
+  store volatile <vscale x 4 x float> %s7, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s8, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s9, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s11, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s12, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s13, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s14, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s15, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s16, <vscale x 4 x float>* %ptr1.bc
+  ret void
+}
+
+; Use AAVPCS, no SVE register in z0-z7 used (floats occupy z0-z7) but predicate arg is used
+
+define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, <vscale x 16 x i1> %p0, float * %ptr) nounwind {
+; CHECK-LABEL: aavpcs3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x5]
+; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x4]
+; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x3]
+; CHECK-NEXT:    ld1w { z6.s }, p0/z, [x2]
+; CHECK-NEXT:    ld1w { z7.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr x8, [sp, #16]
+; CHECK-NEXT:    st1w { z24.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast float * %ptr to <vscale x 4 x float> *
+  store volatile <vscale x 4 x float> %s8, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s9, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s10, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s11, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s12, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s13, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s14, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s15, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s16, <vscale x 4 x float>* %ptr1.bc
+  ret void
+}
+
+; Use AAVPCS, SVE register in z0-z7 used (i32s don't occupy z0-z7)
+
+define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, i32 %s7, <vscale x 4 x i32> %s8, <vscale x 4 x i32> %s9, <vscale x 4 x i32> %s10, <vscale x 4 x i32> %s11, <vscale x 4 x i32> %s12, <vscale x 4 x i32> %s13, <vscale x 4 x i32> %s14, <vscale x 4 x i32> %s15, <vscale x 4 x i32> %s16, <vscale x 4 x i32> %s17, i32 * %ptr) nounwind {
+; CHECK-LABEL: aavpcs4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr x9, [sp, #16]
+; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x8]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x9]
+; CHECK-NEXT:    st1w { z24.s }, p0, [x9]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast i32 * %ptr to <vscale x 4 x i32> *
+  store volatile <vscale x 4 x i32> %s8, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s9, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s10, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s11, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s12, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s13, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s14, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s15, <vscale x 4 x i32>* %ptr1.bc
+  store volatile <vscale x 4 x i32> %s16, <vscale x 4 x i32>* %ptr1.bc
+  ret void
+}
+
+; Use AAPCS, no SVE register in z0-z7 used (floats occupy z0-z7)
+
+define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, float * %ptr) nounwind {
+; CHECK-LABEL: aapcs1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x5]
+; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x4]
+; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x3]
+; CHECK-NEXT:    ld1w { z6.s }, p0/z, [x2]
+; CHECK-NEXT:    ld1w { z7.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z16.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr x8, [sp, #16]
+; CHECK-NEXT:    st1w { z16.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast float * %ptr to <vscale x 4 x float> *
+  store volatile <vscale x 4 x float> %s8, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s9, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s10, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s11, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s12, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s13, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s14, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s15, <vscale x 4 x float>* %ptr1.bc
+  store volatile <vscale x 4 x float> %s16, <vscale x 4 x float>* %ptr1.bc
+  ret void
+}
+
 declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
 declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
 declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)


        

