[llvm] 4263585 - [AArch64][SVE] Allow accesses to SVE stack objects to use frame pointer

Thu Jan 28 04:40:47 PST 2021

Author: Bradley Smith
Date: 2021-01-28T12:39:57Z
New Revision: 42635856ed3c9a05957640f9deb50cf865c03825

URL: https://github.com/llvm/llvm-project/commit/42635856ed3c9a05957640f9deb50cf865c03825
DIFF: https://github.com/llvm/llvm-project/commit/42635856ed3c9a05957640f9deb50cf865c03825.diff

LOG: [AArch64][SVE] Allow accesses to SVE stack objects to use frame pointer

The layout of the stack frame for SVE means that using the frame pointer
rather than the stack pointer for an access to an SVE stack object
removes the need for an additional add to jump over the non-SVE objects.

Likewise, the opposite is true for non-SVE stack objects, which are better
accessed using the stack pointer.

This patch allows for the former to be done by having hasFP return true
in the presence of both SVE and non-SVE stack objects, and also fixes a
minor issue whereby the latter would not be done for certain offsets.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
    llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
    llvm/test/CodeGen/AArch64/framelayout-sve.mir
    llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 65ee5016042c..f5847cfa8fa9 100644

--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -296,6 +296,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
 /// pointer register.
 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   // Win64 EH requires a frame pointer if funclets are present, as the locals
   // are accessed off the frame pointer in both the parent function and the
@@ -320,6 +321,14 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
   if (!MFI.isMaxCallFrameSizeComputed() ||
       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
     return true;
+  // If there are both SVE and non-SVE objects on the stack, make the frame
+  // pointer available since it may be more performant to use it.
+  uint64_t CalleeStackSize = AFI->isCalleeSavedStackSizeComputed()
+                                 ? AFI->getCalleeSavedStackSize()
+                                 : 0;
+  uint64_t NonSVEStackSize = MFI.getStackSize() - CalleeStackSize;
+  if (AFI->getStackSizeSVE() && NonSVEStackSize)
+    return true;
 
   return false;
 }
@@ -1883,10 +1892,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
   // right thing for the emergency spill slot.
   bool UseFP = false;
   if (AFI->hasStackFrame() && !isSVE) {
-    // We shouldn't prefer using the FP when there is an SVE area
-    // in between the FP and the non-SVE locals/spills.
-    PreferFP &= !SVEStackSize;
-
     // Note: Keeping the following as multiple 'if' statements rather than
     // merging to a single expression for readability.
     //
@@ -1907,6 +1912,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
       PreferFP |= Offset > -FPOffset;
 
+      // The FP offset will not fit if there is an SVE area in the way.
+      if (SVEStackSize && FPOffset < 0)
+        FPOffsetFits = false;
+
       if (MFI.hasVarSizedObjects()) {
         // If we have variable sized objects, we can use either FP or BP, as the
         // SP offset is unknown. We can use the base pointer if we have one and

diff  --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index f60e2b6c316e..0298d9e5c358 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -248,6 +248,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     return getCalleeSavedStackSize();
   }
 
+  bool isCalleeSavedStackSizeComputed() const {
+    return HasCalleeSavedStackSize;
+  }
+
   unsigned getCalleeSavedStackSize() const {
     assert(HasCalleeSavedStackSize &&
            "CalleeSavedStackSize has not been calculated");

diff  --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
index 75917ef32ae2..3e65e40449ee 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
@@ -12,16 +12,16 @@
 # CHECK1: : DW_OP_breg31 WSP+16)
 # CHECK1: DW_AT_type {{.*}}ty32
 #
-# CHECK2: : DW_OP_breg31 WSP+16, DW_OP_lit16, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus)
+# CHECK2: : DW_OP_breg29 W29+0, DW_OP_lit8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_minus)
 # CHECK2: DW_AT_type {{.*}}svint32_t
 #
-# CHECK3: : DW_OP_breg31 WSP+16, DW_OP_lit8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus)
+# CHECK3: : DW_OP_breg29 W29+0, DW_OP_lit16, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_minus)
 # CHECK3: DW_AT_type {{.*}}svint32_t
 #
-# CHECK4: : DW_OP_breg31 WSP+16, DW_OP_lit7, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus)
+# CHECK4: : DW_OP_breg29 W29+0, DW_OP_lit17, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_minus)
 # CHECK4: DW_AT_type {{.*}}svbool_t
 #
-# CHECK5: : DW_OP_breg31 WSP+16, DW_OP_lit6, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus)
+# CHECK5: : DW_OP_breg29 W29+0, DW_OP_lit18, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_minus)
 # CHECK5: DW_AT_type {{.*}}svbool_t
 
 --- |

diff  --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index b3d17cf17bbe..3418228c3501 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -57,6 +57,7 @@
 
 # CHECK:      bb.0.entry:
 # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK-COUNT-2: frame-setup CFI_INSTRUCTION
@@ -67,11 +68,9 @@
 # CHECK-NEXT: RET_ReallyLR
 
 # ASM-LABEL: test_allocate_sve:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG
-# ASM-NEXT:  .cfi_offset w29, -16
+# ASM:       .cfi_offset w29, -16
 #
-# UNWINDINFO:       DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT:  DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_offset: reg29 -16
 name:            test_allocate_sve
 stack:
   - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
@@ -96,6 +95,7 @@ body:             |
 # CHECK:      bb.0.entry:
 # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -32
 # CHECK-NEXT: frame-setup STPXi killed $x21, killed $x20, $sp, 2
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK-COUNT-4: frame-setup CFI_INSTRUCTION
@@ -109,13 +109,11 @@ body:             |
 # CHECK-NEXT: RET_ReallyLR
 #
 # ASM-LABEL: test_allocate_sve_gpr_callee_saves:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG
-# ASM-NEXT:  .cfi_offset w20, -8
+# ASM:       .cfi_offset w20, -8
 # ASM-NEXT:  .cfi_offset w21, -16
 # ASM-NEXT:  .cfi_offset w29, -32
 #
-# UNWINDINFO:       DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT:  DW_CFA_offset: reg20 -8
+# UNWINDINFO:       DW_CFA_offset: reg20 -8
 # UNWINDINFO-NEXT:  DW_CFA_offset: reg21 -16
 # UNWINDINFO-NEXT:  DW_CFA_offset: reg29 -32
 name:            test_allocate_sve_gpr_callee_saves
@@ -184,16 +182,14 @@ body:             |
 
 # CHECK:      bb.0.entry:
 # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3
 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK-COUNT-2: frame-setup CFI_INSTRUCTION
 
-# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16
-# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 2
-# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16
-# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 1
-# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16
-# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 7
+# CHECK-NEXT: STR_ZXI $z0, $fp, -1
+# CHECK-NEXT: STR_ZXI $z1, $fp, -2
+# CHECK-NEXT: STR_PXI $p0, $fp, -17
 
 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
 # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
@@ -201,11 +197,9 @@ body:             |
 # CHECK-NEXT: RET_ReallyLR
 #
 # ASM-LABEL:  test_address_sve:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG
-# ASM-NEXT:  .cfi_offset w29, -16
+# ASM:        .cfi_offset w29, -16
 #
-# UNWINDINFO:       DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT:  DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_offset: reg29 -16
 
 name:            test_address_sve
 frameInfo:
@@ -298,12 +292,12 @@ body:             |
 
 # CHECK:      bb.0.entry:
 # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1
 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK-COUNT-2: frame-setup CFI_INSTRUCTION
 
-# CHECK:      $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1
-# CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4
+# CHECK-NEXT: $x0 = LDRXui $fp, 2
 
 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1
 # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
@@ -311,11 +305,9 @@ body:             |
 # CHECK-NEXT: RET_ReallyLR
 #
 # ASM-LABEL: test_stack_arg_sve:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
-# ASM-NEXT:  .cfi_offset w29, -16
+# ASM:       .cfi_offset w29, -16
 #
-# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_offset: reg29 -16
 
 name:             test_stack_arg_sve
 fixedStack:
@@ -460,11 +452,9 @@ body:             |
 # CHECK: RET_ReallyLR
 #
 # ASM-LABEL: save_restore_pregs_sve:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 8 * VG
-# ASM-NEXT:  .cfi_offset w29, -16
+# ASM:       .cfi_offset w29, -16
 #
-# UNWINDINFO:         DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT:    DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_offset: reg29 -16
 name: save_restore_pregs_sve
 stack:
   - { id: 0, stack-id: default, size: 32, alignment: 16 }
@@ -480,6 +470,7 @@ body:             |
 ...
 # CHECK-LABEL: name: save_restore_zregs_sve
 # CHECK:      $sp = frame-setup STRXpre killed $fp, $sp, -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3
 # CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0
 # CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1
@@ -496,13 +487,11 @@ body:             |
 # CHECK-NEXT: RET_ReallyLR
 #
 # ASM-LABEL: save_restore_zregs_sve:
-# ASM:       .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 24 * VG
-# ASM-NEXT:  .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8  @ cfa - 16 - 8 * VG
+# ASM:       .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8  @ cfa - 16 - 8 * VG
 # ASM-NEXT:  .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9  @ cfa - 16 - 16 * VG
 # ASM-NEXT:  .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10  @ cfa - 16 - 24 * VG
 
-# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
+# UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
@@ -558,8 +547,7 @@ body:             |
 # CHECK: RET_ReallyLR
 #
 # ASM-LABEL: save_restore_sve:
-# ASM:       .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG
-# ASM-NEXT:  .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
+# ASM:       .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
 # ASM-NEXT:  .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
 # ASM-NEXT:  .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
 # ASM-NEXT:  .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG
@@ -572,8 +560,7 @@ body:             |
 # ASM-NEXT:  .cfi_offset w21, -24
 # ASM-NEXT:  .cfi_offset w29, -32
 #
-# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
-# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
+# UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus
 # UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus

diff  --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 806dd7e57dee..f6bcb7e2e2d2 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -41,15 +41,18 @@ define float @foo2(double* %x0, double* %x1) nounwind {
 ; CHECK-LABEL: foo2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    addvl sp, sp, #-4
-; CHECK-NEXT:    sub sp, sp, #16 // =16
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x8, sp, #16 // =16
-; CHECK-NEXT:    add x9, sp, #16 // =16
+; CHECK-NEXT:    addvl x8, x29, #-4
 ; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    st1d { z16.d }, p0, [x29, #-4, mul vl]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT:    mov w1, #1
 ; CHECK-NEXT:    mov w2, #2
 ; CHECK-NEXT:    mov w3, #3
@@ -57,12 +60,8 @@ define float @foo2(double* %x0, double* %x1) nounwind {
 ; CHECK-NEXT:    mov w5, #5
 ; CHECK-NEXT:    mov w6, #6
 ; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    str x8, [sp, #-16]!
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
-; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
-; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    str x8, [sp]
 ; CHECK-NEXT:    bl callee2
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    add sp, sp, #16 // =16