[llvm] 98e52e8 - [VE] Restructure eliminateFrameIndex
Kazushi Marukawa via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 5 03:28:17 PDT 2022
Author: Kazushi (Jam) Marukawa
Date: 2022-07-05T19:28:11+09:00
New Revision: 98e52e8bff525b1fb2b269f74b27f0a984588c9c
URL: https://github.com/llvm/llvm-project/commit/98e52e8bff525b1fb2b269f74b27f0a984588c9c
DIFF: https://github.com/llvm/llvm-project/commit/98e52e8bff525b1fb2b269f74b27f0a984588c9c.diff
LOG: [VE] Restructure eliminateFrameIndex
Restructure the current implementation of eliminateFrameIndex function
in order to support more instructions.
Reviewed By: efocht
Differential Revision: https://reviews.llvm.org/D129034
Added:
llvm/test/CodeGen/VE/Scalar/load_stk.ll
llvm/test/CodeGen/VE/Scalar/store_stk.ll
Modified:
llvm/lib/Target/VE/VERegisterInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index d175ad26c742c..cd2c1d75a8fcc 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -133,66 +133,179 @@ static unsigned offsetToDisp(MachineInstr &MI) {
return OffDisp;
}
-static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
- MachineInstr &MI, const DebugLoc &dl,
- unsigned FIOperandNum, int Offset, Register FrameReg) {
- // Replace frame index with a frame pointer reference directly.
- // VE has 32 bit offset field, so no need to expand a target instruction.
- // Directly encode it.
+class EliminateFrameIndex {
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const DebugLoc &DL;
+ MachineBasicBlock &MBB;
+ MachineBasicBlock::iterator II;
+ Register clobber;
+
+ // Some helper functions for the ease of instruction building.
+ MachineFunction &getFunc() const { return *MBB.getParent(); }
+ inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const {
+ return TRI.getSubReg(Reg, Idx);
+ }
+ inline const MCInstrDesc &get(unsigned Opcode) const {
+ return TII.get(Opcode);
+ }
+ inline MachineInstrBuilder build(const MCInstrDesc &MCID, Register DestReg) {
+ return BuildMI(MBB, II, DL, MCID, DestReg);
+ }
+ inline MachineInstrBuilder build(unsigned InstOpc, Register DestReg) {
+ return build(get(InstOpc), DestReg);
+ }
+ inline MachineInstrBuilder build(const MCInstrDesc &MCID) {
+ return BuildMI(MBB, II, DL, MCID);
+ }
+ inline MachineInstrBuilder build(unsigned InstOpc) {
+ return build(get(InstOpc));
+ }
+
+ // Calculate an address of frame index from a frame register and a given
+ // offset if the offset doesn't fit in the immediate field. Use a clobber
+ // register to hold calculated address.
+ void prepareReplaceFI(MachineInstr &MI, Register &FrameReg, int64_t &Offset,
+ int64_t Bytes = 0);
+ // Replace the frame index in \p MI with a frame register and a given offset
+ // if it fits in the immediate field. Otherwise, use pre-calculated address
+ // in a clobber register.
+ void replaceFI(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+
+ // Expand and eliminate Frame Index of pseudo STQrii and LDQrii.
+ void processSTQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+
+public:
+ EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const DebugLoc &DL, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II)
+ : TII(TII), TRI(TRI), DL(DL), MBB(MBB), II(II), clobber(VE::SX13) {}
+
+ // Expand and eliminate Frame Index from MI
+ void processMI(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+};
+
+// Prepare the frame index if it doesn't fit in the immediate field. Use
+// clobber register to hold calculated address.
+void EliminateFrameIndex::prepareReplaceFI(MachineInstr &MI, Register &FrameReg,
+ int64_t &Offset, int64_t Bytes) {
+ if (isInt<32>(Offset) && isInt<32>(Offset + Bytes)) {
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it. So, nothing to prepare here.
+ return;
+ }
+
+ // If the offset doesn't fit, emit following codes. This clobbers SX13
+ // which we always know is available here.
+ // lea %clobber, Offset@lo
+ // and %clobber, %clobber, (32)0
+ // lea.sl %clobber, Offset@hi(FrameReg, %clobber)
+ build(VE::LEAzii, clobber).addImm(0).addImm(0).addImm(Lo_32(Offset));
+ build(VE::ANDrm, clobber).addReg(clobber).addImm(M0(32));
+ build(VE::LEASLrri, clobber)
+ .addReg(clobber)
+ .addReg(FrameReg)
+ .addImm(Hi_32(Offset));
+
+ // Use clobber register as a frame register and 0 offset
+ FrameReg = clobber;
+ Offset = 0;
+}
+
+// Replace the frame index in \p MI with a proper byte and framereg offset.
+void EliminateFrameIndex::replaceFI(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(isInt<32>(Offset));
+
+ // The offset must be small enough to fit in the immediate field after
+ // call of prepareReplaceFI. Therefore, we directly encode it.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
MI.getOperand(FIOperandNum + offsetToDisp(MI)).ChangeToImmediate(Offset);
}
+void EliminateFrameIndex::processSTQ(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STQrii);
+ LLVM_DEBUG(dbgs() << "processSTQ: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 8);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ Register SrcHiReg = getSubReg(SrcReg, VE::sub_even);
+ Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd);
+ // VE stores HiReg to 8(addr) and LoReg to 0(addr)
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(SrcLoReg);
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ // Mutate to 'hi' store.
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).setReg(SrcHiReg);
+ Offset += 8;
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDQrii);
+ LLVM_DEBUG(dbgs() << "processLDQ: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 8);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestHiReg = getSubReg(DestReg, VE::sub_even);
+ Register DestLoReg = getSubReg(DestReg, VE::sub_odd);
+ // VE loads HiReg from 8(addr) and LoReg from 0(addr)
+ MachineInstr *StMI =
+ build(VE::LDrii, DestLoReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*StMI, FrameReg, Offset, 1);
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).setReg(DestHiReg);
+ Offset += 8;
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ switch (MI.getOpcode()) {
+ case VE::STQrii:
+ processSTQ(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDQrii:
+ processLDQ(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ }
+ prepareReplaceFI(MI, FrameReg, Offset);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
- DebugLoc dl = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
MachineFunction &MF = *MI.getParent()->getParent();
- const VEFrameLowering *TFI = getFrameLowering(MF);
+ const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>();
+ const VEFrameLowering &TFI = *getFrameLowering(MF);
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const VERegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ EliminateFrameIndex EFI(TII, TRI, DL, *MI.getParent(), II);
+ // Retrieve FrameReg and byte offset for stack slot.
Register FrameReg;
- int Offset;
- Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
-
+ int64_t Offset =
+ TFI.getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
Offset += MI.getOperand(FIOperandNum + offsetToDisp(MI)).getImm();
- if (MI.getOpcode() == VE::STQrii) {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- Register SrcReg = MI.getOperand(3).getReg();
- Register SrcHiReg = getSubReg(SrcReg, VE::sub_even);
- Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd);
- // VE stores HiReg to 8(addr) and LoReg to 0(addr)
- MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(VE::STrii))
- .addReg(FrameReg)
- .addImm(0)
- .addImm(0)
- .addReg(SrcLoReg);
- replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
- MI.setDesc(TII.get(VE::STrii));
- MI.getOperand(3).setReg(SrcHiReg);
- Offset += 8;
- } else if (MI.getOpcode() == VE::LDQrii) {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- Register DestReg = MI.getOperand(0).getReg();
- Register DestHiReg = getSubReg(DestReg, VE::sub_even);
- Register DestLoReg = getSubReg(DestReg, VE::sub_odd);
- // VE loads HiReg from 8(addr) and LoReg from 0(addr)
- MachineInstr *StMI =
- BuildMI(*MI.getParent(), II, dl, TII.get(VE::LDrii), DestLoReg)
- .addReg(FrameReg)
- .addImm(0)
- .addImm(0);
- replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
- MI.setDesc(TII.get(VE::LDrii));
- MI.getOperand(0).setReg(DestHiReg);
- Offset += 8;
- }
-
- replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
+ EFI.processMI(MI, FrameReg, Offset, FIOperandNum);
}
Register VERegisterInfo::getFrameRegister(const MachineFunction &MF) const {
diff --git a/llvm/test/CodeGen/VE/Scalar/load_stk.ll b/llvm/test/CodeGen/VE/Scalar/load_stk.ll
new file mode 100644
index 0000000000000..9ffab1464a992
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Scalar/load_stk.ll
@@ -0,0 +1,795 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve | FileCheck %s
+
+;;; Test load instructions
+;;;
+;;; Note:
+;;; We test load instructions using general stack, stack with dynamic
+;;; allocation, stack with dynamic allocation and alignment, and stack
+;;; with dynamic allocation, alignment, and spill.
+;;;
+;;; First test using a stack for leaf function.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------| <- old sp
+;;; | Local variables of fixed size |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using sp (%s11). In addition, please remember
+;;; that stack is aligned by 16 bytes.
+;;;
+;;; Second test using a general stack.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function |
+;;; |----------------------------------------------|
+;;; | Return address for this function |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; | Local variables of fixed size |
+;;; |----------------------------------------------|
+;;; |.variable-sized.local.variables.(VLAs)........|
+;;; |..............................................|
+;;; |..............................................|
+;;; |----------------------------------------------| <- returned by alloca
+;;; | Parameter area for callee |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee |
+;;; |----------------------------------------------|
+;;; | Return address for callee |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using fp (%s9) since the size of VLA is not
+;;; known. At the beginning of the functions, allocates 240 + data
+;;; bytes. 240 means RSA+RA+FP (=176) + Parameter (=64).
+;;;
+;;; Third test using a general stack.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function |
+;;; |----------------------------------------------|
+;;; | Return address for this function |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; |.empty.space.to.make.part.below.aligned.in....|
+;;; |.case.it.needs.more.than.the.standard.16-byte.| (size of this area is
+;;; |.alignment....................................| unknown at compile time)
+;;; |----------------------------------------------|
+;;; | Local variables of fixed size including spill|
+;;; | slots |
+;;; |----------------------------------------------| <- bp(not defined by ABI,
+;;; |.variable-sized.local.variables.(VLAs)........| LLVM chooses SX17)
+;;; |..............................................| (size of this area is
+;;; |..............................................| unknown at compile time)
+;;; |----------------------------------------------| <- stack top (returned by
+;;; | Parameter area for callee | alloca)
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee |
+;;; |----------------------------------------------|
+;;; | Return address for callee |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using bp (%s17) since the size of alignment
+;;; and VLA are not known. At the beginning of the functions, allocates
+;;; pad(240 + data + align) bytes. Then, access data through bp + pad(240)
+;;; since this address doesn't change even if VLA is dynamically allocated.
+;;;
+;;; Fourth test using a general stack with some spills.
+;;;
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc i64 @loadi64_stk() {
+; CHECK-LABEL: loadi64_stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: adds.l %s11, -16, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: ld %s0, 8(, %s11)
+; CHECK-NEXT: adds.l %s11, 16, %s11
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca i64, align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %1)
+ %2 = load volatile i64, ptr %1, align 8, !tbaa !3
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %1)
+ ret i64 %2
+}
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc i64 @loadi64_stk_big() {
+; CHECK-LABEL: loadi64_stk_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s11, -2147483648(, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB1_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: ld %s0, 2147483640(, %s11)
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: lea %s2, 2147483640
+; CHECK-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld %s3, (%s1, %s11)
+; CHECK-NEXT: lea %s1, 8(, %s1)
+; CHECK-NEXT: brne.l %s1, %s2, .LBB1_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca i64, align 8
+ %2 = alloca [268435455 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %1)
+ call void @llvm.lifetime.start.p0(i64 2147483640, ptr nonnull %2)
+ %3 = load volatile i64, ptr %1, align 8, !tbaa !3
+ br label %5
+
+4: ; preds = %5
+ call void @llvm.lifetime.end.p0(i64 2147483640, ptr nonnull %2)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %1)
+ ret i64 %3
+
+5: ; preds = %0, %5
+ %6 = phi i64 [ 0, %0 ], [ %9, %5 ]
+ %7 = getelementptr inbounds [268435455 x i64], ptr %2, i64 0, i64 %6
+ %8 = load volatile i64, ptr %7, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %6, 1
+ %10 = icmp eq i64 %9, 268435455
+ br i1 %10, label %4, label %5, !llvm.loop !7
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc i64 @loadi64_stk_big2() {
+; CHECK-LABEL: loadi64_stk_big2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB2_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: lea %s13, -2147483640
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: ld %s0, (, %s13)
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: lea %s2, -2147483648
+; CHECK-NEXT: and %s2, %s2, (32)0
+; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld %s3, 8(%s1, %s11)
+; CHECK-NEXT: lea %s1, 8(, %s1)
+; CHECK-NEXT: brne.l %s1, %s2, .LBB2_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca i64, align 8
+ %2 = alloca [268435456 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %1)
+ call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %2)
+ %3 = load volatile i64, ptr %1, align 8, !tbaa !3
+ br label %5
+
+4: ; preds = %5
+ call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %2)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %1)
+ ret i64 %3
+
+5: ; preds = %0, %5
+ %6 = phi i64 [ 0, %0 ], [ %9, %5 ]
+ %7 = getelementptr inbounds [268435456 x i64], ptr %2, i64 0, i64 %6
+ %8 = load volatile i64, ptr %7, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %6, 1
+ %10 = icmp eq i64 %9, 268435456
+ br i1 %10, label %4, label %5, !llvm.loop !9
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc i64 @loadi64_stk_dyn(i64 noundef %0) {
+; CHECK-LABEL: loadi64_stk_dyn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -256(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s0, -8(, %s9)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca i64, align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 8
+ %4 = load volatile i64, ptr %3, align 8, !tbaa !3
+ %5 = load volatile i64, ptr %2, align 8, !tbaa !3
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+ ret i64 %5
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc i64 @loadi64_stk_dyn_align(i64 noundef %0) {
+; CHECK-LABEL: loadi64_stk_dyn_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s0, 256(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca i64, align 32
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 8
+ %4 = load volatile i64, ptr %3, align 8, !tbaa !3
+ %5 = load volatile i64, ptr %2, align 32, !tbaa !10
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+ ret i64 %5
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc i64 @loadi64_stk_dyn_align2(i64 noundef %0) {
+; CHECK-LABEL: loadi64_stk_dyn_align2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -320(, %s11)
+; CHECK-NEXT: and %s11, %s11, (58)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB5_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s0, 288(, %s17)
+; CHECK-NEXT: ld %s1, 256(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca i64, align 32
+ %3 = alloca i64, align 64
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+ %4 = alloca i8, i64 %0, align 8
+ %5 = load volatile i64, ptr %4, align 8, !tbaa !3
+ %6 = load volatile i64, ptr %2, align 32, !tbaa !10
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ %7 = load volatile i64, ptr %3, align 64, !tbaa !10
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+ ret i64 %6
+}
+
+; Function Attrs: nounwind
+define x86_fastcallcc i64 @loadi64_stk_dyn_align_spill(i64 noundef %0) {
+; CHECK-LABEL: loadi64_stk_dyn_align_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s19, 56(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: or %s18, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s19, 256(, %s17)
+; CHECK-NEXT: lea %s0, dummy@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, pass@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0)
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: or %s0, 0, %s19
+; CHECK-NEXT: ld %s19, 56(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca i64, align 32
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 8
+ %4 = load volatile i64, ptr %3, align 8, !tbaa !3
+ %5 = load volatile i64, ptr %2, align 32, !tbaa !10
+ tail call void (...) @dummy()
+ tail call void @pass(i64 noundef %0)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+ ret i64 %5
+}
+
+declare void @dummy(...)
+
+declare void @pass(i64 noundef)
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc fp128 @loadquad_stk() {
+; CHECK-LABEL: loadquad_stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: adds.l %s11, -16, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: ld %s1, (, %s11)
+; CHECK-NEXT: ld %s0, 8(, %s11)
+; CHECK-NEXT: adds.l %s11, 16, %s11
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca fp128, align 16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %1)
+ %2 = load volatile fp128, ptr %1, align 16, !tbaa !12
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %1)
+ ret fp128 %2
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc fp128 @loadquad_stk_big() {
+; CHECK-LABEL: loadquad_stk_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB8_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB8_4:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: ld %s1, (, %s13)
+; CHECK-NEXT: ld %s0, 8(, %s13)
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: lea %s3, 2147483640
+; CHECK-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld %s4, 8(%s2, %s11)
+; CHECK-NEXT: lea %s2, 8(, %s2)
+; CHECK-NEXT: brne.l %s2, %s3, .LBB8_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca fp128, align 16
+ %2 = alloca [268435455 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %1)
+ call void @llvm.lifetime.start.p0(i64 2147483640, ptr nonnull %2)
+ %3 = load volatile fp128, ptr %1, align 16, !tbaa !12
+ br label %5
+
+4: ; preds = %5
+ call void @llvm.lifetime.end.p0(i64 2147483640, ptr nonnull %2)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %1)
+ ret fp128 %3
+
+5: ; preds = %0, %5
+ %6 = phi i64 [ 0, %0 ], [ %9, %5 ]
+ %7 = getelementptr inbounds [268435455 x i64], ptr %2, i64 0, i64 %6
+ %8 = load volatile i64, ptr %7, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %6, 1
+ %10 = icmp eq i64 %9, 268435455
+ br i1 %10, label %4, label %5, !llvm.loop !14
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc fp128 @loadquad_stk_big2() {
+; CHECK-LABEL: loadquad_stk_big2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB9_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: ld %s1, (, %s13)
+; CHECK-NEXT: ld %s0, 8(, %s13)
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: lea %s3, -2147483648
+; CHECK-NEXT: and %s3, %s3, (32)0
+; CHECK-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld %s4, (%s2, %s11)
+; CHECK-NEXT: lea %s2, 8(, %s2)
+; CHECK-NEXT: brne.l %s2, %s3, .LBB9_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %1 = alloca fp128, align 16
+ %2 = alloca [268435456 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %1)
+ call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %2)
+ %3 = load volatile fp128, ptr %1, align 16, !tbaa !12
+ br label %5
+
+4: ; preds = %5
+ call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %2)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %1)
+ ret fp128 %3
+
+5: ; preds = %0, %5
+ %6 = phi i64 [ 0, %0 ], [ %9, %5 ]
+ %7 = getelementptr inbounds [268435456 x i64], ptr %2, i64 0, i64 %6
+ %8 = load volatile i64, ptr %7, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %6, 1
+ %10 = icmp eq i64 %9, 268435456
+ br i1 %10, label %4, label %5, !llvm.loop !15
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc fp128 @loadquad_stk_dyn(i64 noundef %0) {
+; CHECK-LABEL: loadquad_stk_dyn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -256(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s1, 8(, %s0)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s1, -16(, %s9)
+; CHECK-NEXT: ld %s0, -8(, %s9)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca fp128, align 16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 16
+ %4 = load volatile fp128, ptr %3, align 16, !tbaa !12
+ %5 = load volatile fp128, ptr %2, align 16, !tbaa !12
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+ ret fp128 %5
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc fp128 @loadquad_stk_dyn_align(i64 noundef %0) {
+; CHECK-LABEL: loadquad_stk_dyn_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s1, 8(, %s0)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s1, 256(, %s17)
+; CHECK-NEXT: ld %s0, 264(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca fp128, align 32
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 16
+ %4 = load volatile fp128, ptr %3, align 16, !tbaa !12
+ %5 = load volatile fp128, ptr %2, align 32, !tbaa !16
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+ ret fp128 %5
+}
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn
+define x86_fastcallcc fp128 @loadquad_stk_dyn_align2(i64 noundef %0) {
+; CHECK-LABEL: loadquad_stk_dyn_align2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -320(, %s11)
+; CHECK-NEXT: and %s11, %s11, (58)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s1, 8(, %s0)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s1, 288(, %s17)
+; CHECK-NEXT: ld %s0, 296(, %s17)
+; CHECK-NEXT: ld %s3, 256(, %s17)
+; CHECK-NEXT: ld %s2, 264(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca fp128, align 32
+ %3 = alloca fp128, align 64
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+ %4 = alloca i8, i64 %0, align 16
+ %5 = load volatile fp128, ptr %4, align 16, !tbaa !12
+ %6 = load volatile fp128, ptr %2, align 32, !tbaa !16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ %7 = load volatile fp128, ptr %3, align 64, !tbaa !16
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+ ret fp128 %6
+}
+
+; Function Attrs: nounwind
+define x86_fastcallcc fp128 @loadquad_stk_dyn_align_spill(i64 noundef %0) {
+; CHECK-LABEL: loadquad_stk_dyn_align_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s20, 64(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s21, 72(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: or %s18, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s0)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack at lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack at hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: ld %s1, 8(, %s0)
+; CHECK-NEXT: ld %s0, (, %s0)
+; CHECK-NEXT: ld %s21, 256(, %s17)
+; CHECK-NEXT: ld %s20, 264(, %s17)
+; CHECK-NEXT: lea %s0, dummy@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, pass@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0)
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: or %s0, 0, %s20
+; CHECK-NEXT: or %s1, 0, %s21
+; CHECK-NEXT: ld %s21, 72(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s20, 64(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca fp128, align 32
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+ %3 = alloca i8, i64 %0, align 16
+ %4 = load volatile fp128, ptr %3, align 16, !tbaa !12
+ %5 = load volatile fp128, ptr %2, align 32, !tbaa !16
+ tail call void (...) @dummy()
+ tail call void @pass(i64 noundef %0)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+ ret fp128 %5
+}
+
+!3 = !{!4, !4, i64 0}
+!4 = !{!"long", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = distinct !{!9, !8}
+!10 = !{!11, !4, i64 0}
+!11 = !{!"", !4, i64 0}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"long double", !5, i64 0}
+!14 = distinct !{!14, !8}
+!15 = distinct !{!15, !8}
+!16 = !{!17, !13, i64 0}
+!17 = !{!"", !13, i64 0}
diff --git a/llvm/test/CodeGen/VE/Scalar/store_stk.ll b/llvm/test/CodeGen/VE/Scalar/store_stk.ll
new file mode 100644
index 0000000000000..76a3fda813620
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Scalar/store_stk.ll
@@ -0,0 +1,808 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve | FileCheck %s
+
+;;; Test store instructions
+;;;
+;;; Note:
+;;; We test store instructions using general stack, stack with dynamic
+;;; allocation, stack with dynamic allocation and alignment, and stack
+;;; with dynamic allocation, alignment, and spill.
+;;;
+;;; First test using a stack for leaf function.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------| <- old sp
+;;; | Local variables of fixed size |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using sp (%s11). In addition, please remember
+;;; that stack is aligned by 16 bytes.
+;;;
+;;; Second test using a general stack.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function |
+;;; |----------------------------------------------|
+;;; | Return address for this function |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; | Local variables of fixed size |
+;;; |----------------------------------------------|
+;;; |.variable-sized.local.variables.(VLAs)........|
+;;; |..............................................|
+;;; |..............................................|
+;;; |----------------------------------------------| <- returned by alloca
+;;; | Parameter area for callee |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee |
+;;; |----------------------------------------------|
+;;; | Return address for callee |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using fp (%s9) since the size of VLA is not
+;;; known. At the beginning of the functions, allocates 240 + data
+;;; bytes. 240 means RSA+RA+FP (=176) + Parameter (=64).
+;;;
+;;; Third test using a general stack.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function |
+;;; |----------------------------------------------|
+;;; | Return address for this function |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; |.empty.space.to.make.part.below.aligned.in....|
+;;; |.case.it.needs.more.than.the.standard.16-byte.| (size of this area is
+;;; |.alignment....................................| unknown at compile time)
+;;; |----------------------------------------------|
+;;; | Local variables of fixed size including spill|
+;;; | slots |
+;;; |----------------------------------------------| <- bp(not defined by ABI,
+;;; |.variable-sized.local.variables.(VLAs)........| LLVM chooses SX17)
+;;; |..............................................| (size of this area is
+;;; |..............................................| unknown at compile time)
+;;; |----------------------------------------------| <- stack top (returned by
+;;; | Parameter area for callee | alloca)
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee |
+;;; |----------------------------------------------|
+;;; | Return address for callee |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variable using bp (%s17) since the size of alignment
+;;; and VLA are not known. At the beginning of the functions, allocates
+;;; pad(240 + data + align) bytes. Then, access data through bp + pad(240)
+;;; since this address doesn't change even if VLA is dynamically allocated.
+;;;
+;;; Fourth test using a general stack with some spills.
+;;;
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk(i64 noundef %0) {
+; CHECK-LABEL: storei64_stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: adds.l %s11, -16, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: st %s0, 8(, %s11)
+; CHECK-NEXT: adds.l %s11, 16, %s11
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca i64, align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+ store volatile i64 %0, ptr %2, align 8, !tbaa !3
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+ ret void
+}
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk_big(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s11, -2147483648(, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB1_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: st %s0, 2147483640(, %s11)
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: lea %s2, 2147483640
+; CHECK-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: st %s1, (%s0, %s11)
+; CHECK-NEXT: lea %s0, 8(, %s0)
+; CHECK-NEXT: brne.l %s0, %s2, .LBB1_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 8
+ %4 = alloca [268435455 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ call void @llvm.lifetime.start.p0(i64 2147483640, ptr nonnull %4)
+ store volatile i64 %0, ptr %3, align 8, !tbaa !3
+ br label %6
+
+5: ; preds = %6
+ call void @llvm.lifetime.end.p0(i64 2147483640, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+
+6: ; preds = %2, %6
+ %7 = phi i64 [ 0, %2 ], [ %9, %6 ]
+ %8 = getelementptr inbounds [268435455 x i64], ptr %4, i64 0, i64 %7
+ store volatile i64 %1, ptr %8, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %7, 1
+ %10 = icmp eq i64 %9, 268435455
+ br i1 %10, label %5, label %6, !llvm.loop !7
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk_big2(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_big2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB2_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: lea %s13, -2147483640
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: st %s0, (, %s13)
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: lea %s2, -2147483648
+; CHECK-NEXT: and %s2, %s2, (32)0
+; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: st %s1, 8(%s0, %s11)
+; CHECK-NEXT: lea %s0, 8(, %s0)
+; CHECK-NEXT: brne.l %s0, %s2, .LBB2_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 8
+ %4 = alloca [268435456 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %4)
+ store volatile i64 %0, ptr %3, align 8, !tbaa !3
+ br label %6
+
+5: ; preds = %6
+ call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+
+6: ; preds = %2, %6
+ %7 = phi i64 [ 0, %2 ], [ %9, %6 ]
+ %8 = getelementptr inbounds [268435456 x i64], ptr %4, i64 0, i64 %7
+ store volatile i64 %1, ptr %8, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %7, 1
+ %10 = icmp eq i64 %9, 268435456
+ br i1 %10, label %5, label %6, !llvm.loop !9
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk_dyn(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_dyn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -256(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: or %s2, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s1)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s2, (, %s0)
+; CHECK-NEXT: st %s2, -8(, %s9)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 8
+ store volatile i64 %0, ptr %4, align 8, !tbaa !3
+ store volatile i64 %0, ptr %3, align 8, !tbaa !3
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk_dyn_align(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_dyn_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: or %s2, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s1)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s2, (, %s0)
+; CHECK-NEXT: st %s2, 256(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 32
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 8
+ store volatile i64 %0, ptr %4, align 8, !tbaa !3
+ store volatile i64 %0, ptr %3, align 32, !tbaa !10
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storei64_stk_dyn_align2(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_dyn_align2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -320(, %s11)
+; CHECK-NEXT: and %s11, %s11, (58)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB5_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: or %s2, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s1)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s2, (, %s0)
+; CHECK-NEXT: st %s2, 288(, %s17)
+; CHECK-NEXT: st %s2, 256(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 32
+ %4 = alloca i64, align 64
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ %5 = alloca i8, i64 %1, align 8
+ store volatile i64 %0, ptr %5, align 8, !tbaa !3
+ store volatile i64 %0, ptr %3, align 32, !tbaa !10
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %4)
+ store volatile i64 %0, ptr %4, align 64, !tbaa !10
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: nounwind
+define x86_fastcallcc void @storei64_stk_dyn_align_spill(i64 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storei64_stk_dyn_align_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s19, 56(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s20, 64(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: or %s18, 0, %s1
+; CHECK-NEXT: or %s19, 0, %s0
+; CHECK-NEXT: lea %s0, 15(, %s1)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s20, 240(, %s11)
+; CHECK-NEXT: lea %s0, dummy@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, pass@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0)
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: st %s19, (, %s20)
+; CHECK-NEXT: st %s19, 256(, %s17)
+; CHECK-NEXT: ld %s20, 64(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s19, 56(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca i64, align 32
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 8
+ tail call void (...) @dummy()
+ tail call void @pass(i64 noundef %1)
+ store volatile i64 %0, ptr %4, align 8, !tbaa !3
+ store volatile i64 %0, ptr %3, align 32, !tbaa !10
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %3)
+ ret void
+}
+
+declare void @dummy(...)
+
+declare void @pass(i64 noundef)
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storequad_stk(fp128 noundef %0) {
+; CHECK-LABEL: storequad_stk:
+; CHECK: # %bb.0:
+; CHECK-NEXT: adds.l %s11, -16, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: st %s1, (, %s11)
+; CHECK-NEXT: st %s0, 8(, %s11)
+; CHECK-NEXT: adds.l %s11, 16, %s11
+; CHECK-NEXT: b.l.t (, %s10)
+ %2 = alloca fp128, align 16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+ store volatile fp128 %0, ptr %2, align 16, !tbaa !12
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storequad_stk_big(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB8_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB8_4:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: st %s1, (, %s13)
+; CHECK-NEXT: st %s0, 8(, %s13)
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: lea %s1, 2147483640
+; CHECK-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: st %s2, 8(%s0, %s11)
+; CHECK-NEXT: lea %s0, 8(, %s0)
+; CHECK-NEXT: brne.l %s0, %s1, .LBB8_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca fp128, align 16
+ %4 = alloca [268435455 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ call void @llvm.lifetime.start.p0(i64 2147483640, ptr nonnull %4)
+ store volatile fp128 %0, ptr %3, align 16, !tbaa !12
+ br label %6
+
+5: ; preds = %6
+ call void @llvm.lifetime.end.p0(i64 2147483640, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+
+6: ; preds = %2, %6
+ %7 = phi i64 [ 0, %2 ], [ %9, %6 ]
+ %8 = getelementptr inbounds [268435455 x i64], ptr %4, i64 0, i64 %7
+ store volatile i64 %1, ptr %8, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %7, 1
+ %10 = icmp eq i64 %9, 268435455
+ br i1 %10, label %5, label %6, !llvm.loop !14
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storequad_stk_big2(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_big2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s13, 2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11)
+; CHECK-NEXT: brge.l %s11, %s8, .LBB9_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: lea %s13, -2147483648
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s13, (%s11, %s13)
+; CHECK-NEXT: st %s1, (, %s13)
+; CHECK-NEXT: st %s0, 8(, %s13)
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: lea %s1, -2147483648
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: st %s2, (%s0, %s11)
+; CHECK-NEXT: lea %s0, 8(, %s0)
+; CHECK-NEXT: brne.l %s0, %s1, .LBB9_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: lea %s13, -2147483632
+; CHECK-NEXT: and %s13, %s13, (32)0
+; CHECK-NEXT: lea.sl %s11, (%s13, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca fp128, align 16
+ %4 = alloca [268435456 x i64], align 8
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %4)
+ store volatile fp128 %0, ptr %3, align 16, !tbaa !12
+ br label %6
+
+5: ; preds = %6
+ call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+
+6: ; preds = %2, %6
+ %7 = phi i64 [ 0, %2 ], [ %9, %6 ]
+ %8 = getelementptr inbounds [268435456 x i64], ptr %4, i64 0, i64 %7
+ store volatile i64 %1, ptr %8, align 8, !tbaa !3
+ %9 = add nuw nsw i64 %7, 1
+ %10 = icmp eq i64 %9, 268435456
+ br i1 %10, label %5, label %6, !llvm.loop !15
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storequad_stk_dyn(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_dyn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -256(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: or %s4, 0, %s0
+; CHECK-NEXT: or %s5, 0, %s1
+; CHECK-NEXT: lea %s0, 15(, %s2)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s4, 8(, %s0)
+; CHECK-NEXT: st %s5, (, %s0)
+; CHECK-NEXT: st %s5, -16(, %s9)
+; CHECK-NEXT: st %s4, -8(, %s9)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca fp128, align 16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 16
+ store volatile fp128 %0, ptr %4, align 16, !tbaa !12
+ store volatile fp128 %0, ptr %3, align 16, !tbaa !12
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind
+define x86_fastcallcc void @storequad_stk_dyn_align(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_dyn_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: or %s4, 0, %s0
+; CHECK-NEXT: or %s5, 0, %s1
+; CHECK-NEXT: lea %s0, 15(, %s2)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s4, 8(, %s0)
+; CHECK-NEXT: st %s5, (, %s0)
+; CHECK-NEXT: st %s5, 256(, %s17)
+; CHECK-NEXT: st %s4, 264(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca fp128, align 32
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 16
+ store volatile fp128 %0, ptr %4, align 16, !tbaa !12
+ store volatile fp128 %0, ptr %3, align 32, !tbaa !16
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind
+; Two statically over-aligned fp128 slots (align 32 and align 64) plus a
+; dynamic alloca in one frame: the checks below show the aligned slots being
+; addressed off the base pointer (%s17) while the variable-sized part goes
+; through the __ve_grow_stack runtime call.
+; NOTE(review): the archived email munged "@lo"/"@hi" into " at lo"/" at hi";
+; the relocation specifiers are restored here.
+define x86_fastcallcc void @storequad_stk_dyn_align2(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_dyn_align2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -320(, %s11)
+; CHECK-NEXT: and %s11, %s11, (58)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: or %s4, 0, %s0
+; CHECK-NEXT: or %s5, 0, %s1
+; CHECK-NEXT: lea %s0, 15(, %s2)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, 240(, %s11)
+; CHECK-NEXT: st %s4, 8(, %s0)
+; CHECK-NEXT: st %s5, (, %s0)
+; CHECK-NEXT: st %s5, 288(, %s17)
+; CHECK-NEXT: st %s4, 296(, %s17)
+; CHECK-NEXT: st %s5, 256(, %s17)
+; CHECK-NEXT: st %s4, 264(, %s17)
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ ; slot %3 is align-32, %4 is align-64: the frame is realigned with (58)1
+ ; (i.e. a 64-byte mask) above to satisfy the larger of the two.
+ %3 = alloca fp128, align 32
+ %4 = alloca fp128, align 64
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ ; variable-sized alloca forces the __ve_grow_stack path checked above
+ %5 = alloca i8, i64 %1, align 16
+ store volatile fp128 %0, ptr %5, align 16, !tbaa !12
+ store volatile fp128 %0, ptr %3, align 32, !tbaa !16
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %4)
+ store volatile fp128 %0, ptr %4, align 64, !tbaa !16
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %4)
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+}
+
+; Function Attrs: nounwind
+; Same over-aligned-slot + dynamic-alloca frame as above, but the two calls
+; (@dummy and @pass) force the fp128 halves and the computed address to live
+; across calls — the checks show them spilled into callee-saved registers
+; %s18-%s21 (8-byte folded spill/reload pairs).
+; NOTE(review): the archived email munged "@lo"/"@hi" into " at lo"/" at hi";
+; the relocation specifiers are restored here.
+define x86_fastcallcc void @storequad_stk_dyn_align_spill(fp128 noundef %0, i64 noundef %1) {
+; CHECK-LABEL: storequad_stk_dyn_align_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: st %s17, 40(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -288(, %s11)
+; CHECK-NEXT: and %s11, %s11, (59)1
+; CHECK-NEXT: or %s17, 0, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s19, 56(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s20, 64(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: st %s21, 72(, %s9) # 8-byte Folded Spill
+; CHECK-NEXT: or %s18, 0, %s2
+; CHECK-NEXT: or %s20, 0, %s0
+; CHECK-NEXT: or %s21, 0, %s1
+; CHECK-NEXT: lea %s0, 15(, %s2)
+; CHECK-NEXT: and %s0, -16, %s0
+; CHECK-NEXT: lea %s1, __ve_grow_stack@lo
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s19, 240(, %s11)
+; CHECK-NEXT: lea %s0, dummy@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0)
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: lea %s0, pass@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0)
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: bsic %s10, (, %s12)
+; CHECK-NEXT: st %s20, 8(, %s19)
+; CHECK-NEXT: st %s21, (, %s19)
+; CHECK-NEXT: st %s21, 256(, %s17)
+; CHECK-NEXT: st %s20, 264(, %s17)
+; CHECK-NEXT: ld %s21, 72(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s20, 64(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s19, 56(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload
+; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s17, 40(, %s11)
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
+ %3 = alloca fp128, align 32
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %3)
+ %4 = alloca i8, i64 %1, align 16
+ ; the two calls below split live ranges across call sites, triggering the
+ ; callee-saved spills checked above
+ tail call void (...) @dummy()
+ tail call void @pass(i64 noundef %1)
+ store volatile fp128 %0, ptr %4, align 16, !tbaa !12
+ store volatile fp128 %0, ptr %3, align 32, !tbaa !16
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %3)
+ ret void
+}
+
+; TBAA and loop metadata referenced by the test functions:
+;   !3/!4   - scalar TBAA for "long"
+;   !12/!13 - scalar TBAA for "long double" (the fp128 stores use !12)
+;   !10/!11, !16/!17 - struct-path wrappers over the scalars above
+;   !7, !9, !14, !15 - distinct llvm.loop.mustprogress nodes (presumably
+;     attached to loops in functions outside this excerpt - not visible here)
+!3 = !{!4, !4, i64 0}
+!4 = !{!"long", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = distinct !{!9, !8}
+!10 = !{!11, !4, i64 0}
+!11 = !{!"", !4, i64 0}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"long double", !5, i64 0}
+!14 = distinct !{!14, !8}
+!15 = distinct !{!15, !8}
+!16 = !{!17, !13, i64 0}
+!17 = !{!"", !13, i64 0}
More information about the llvm-commits mailing list