[llvm] [AArch64] Fix invalid use of ld1/st1 in stack alloc (PR #105518)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 21 06:01:13 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: None (Lukacma)
Changes
This patch fixes an invalid use of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, introduced by [c4bac7f](https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9). That commit used ld1/st1 even when the stack offset was outside the immediate range of these instructions, producing invalid assembly.
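For a concrete picture, here is a minimal before/after sketch of the failure mode, lifted from the z8/z9 spill in the regenerated `vg_unwind_with_sve_args` test below (the paired scalar+imm form of st1b only encodes a small signed immediate, while the scalar+scalar form takes the offset in a register):

```asm
// Before the fix: #32 is outside the encodable immediate range of the
// paired scalar+imm form, so this is not a valid instruction.
st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl]

// After the fix: the offset is materialized in a scratch register and
// the scalar+scalar (register-offset) form is used instead.
mov x9, #512
st1b { z8.b, z9.b }, pn8, [sp, x9]
```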
---
Patch is 220.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105518.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+44-10)
- (modified) llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll (+47-27)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll (+504-296)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll (+504-296)
- (modified) llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll (+42-22)
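For orientation before the diff: the fix boils down to a range check on the scaled pair offset when selecting the spill/reload opcode. A minimal standalone sketch of that check (the helper name is hypothetical; the patch inlines the comparison in AArch64FrameLowering.cpp):

```c++
#include <cstdint>

// The paired st1b/ld1b scalar+imm forms take a signed 4-bit immediate,
// so RPI.Offset must lie in [-8, 7]; otherwise the patch falls back to
// the scalar+scalar forms (ST1B_2Z / LD1B_2Z) and materializes the
// offset in a scratch register.
static bool fitsPairedLd1St1Imm(int64_t Offset) {
  return Offset >= -8 && Offset <= 7;
}
```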
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index e0f8853b715354..e2648ed1da87c0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3229,7 +3229,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ if (RPI.isPaired())
+ // When the stack offset is out of range for the st1b scalar+imm variant,
+ // use the scalar+scalar variant.
+ StrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::ST1B_2Z_IMM
+ : AArch64::ST1B_2Z;
+ else
+ StrOpc = AArch64::STR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3354,10 +3360,21 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else {
+ // When the stack offset is out of range for the st1b scalar+imm variant,
+ // store the offset in a register for use with the scalar+scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameSetup);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOStore, Size, Alignment));
@@ -3470,7 +3487,13 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
+ if (RPI.isPaired())
+ // When the stack offset is out of range for the ld1b scalar+imm variant,
+ // use the scalar+scalar variant.
+ LdrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::LD1B_2Z_IMM
+ : AArch64::LD1B_2Z;
+ else
+ LdrOpc = AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3521,10 +3544,21 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else {
+ // When the stack offset is out of range for the ld1b scalar+imm variant,
+ // store the offset in a register for use with the scalar+scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MBBI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOLoad, Size, Alignment));
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6264ce0cf4ae6d..5972ab8469b4c0 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -335,13 +335,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -349,7 +344,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -360,7 +355,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: mov x9, #256 // =0x100
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #320 // =0x140
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #384 // =0x180
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #448 // =0x1c0
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .cfi_offset vg, -16
@@ -369,18 +374,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: bl scalable_callee
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .cfi_restore vg
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov x9, #256 // =0x100
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #320 // =0x140
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #384 // =0x180
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #448 // =0x1c0
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -430,13 +440,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -444,7 +449,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -454,7 +459,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
; FP-CHECK-NEXT: .cfi_offset vg, -32
@@ -463,17 +478,22 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: bl scalable_callee
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
-; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ptrue pn8.b
-; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 29d3d68fc4c3de..0152941addad7d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -56,29 +56,37 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -94,13 +102,18 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -108,16 +121,17 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/105518
More information about the llvm-commits mailing list