[llvm] cb112eb - [X86][CodeGen] Teach frame lowering to spill/reload registers w/ PUSHP/POPP, PUSH2[P]/POP2[P] (#73292)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 27 05:37:11 PST 2023
Author: Shengchen Kan
Date: 2023-11-27T21:37:07+08:00
New Revision: cb112eb16cff222d8fbe7cfd3cb0834f538a691d
URL: https://github.com/llvm/llvm-project/commit/cb112eb16cff222d8fbe7cfd3cb0834f538a691d
DIFF: https://github.com/llvm/llvm-project/commit/cb112eb16cff222d8fbe7cfd3cb0834f538a691d.diff
LOG: [X86][CodeGen] Teach frame lowering to spill/reload registers w/ PUSHP/POPP, PUSH2[P]/POP2[P] (#73292)
#73092 supported the encoding/decoding for PUSHP/POPP
#73233 supported the encoding/decoding for PUSH2[P]/POP2[P]
In this patch, we teach frame lowering to spill/reload registers w/
these instructions.
1. Use PPX for balanced spill/reload
2. Use PUSH2/POP2 for continuous spills/reloads
3. PUSH2/POP2 must be 16B-aligned on the stack, so pad when necessary
Added:
llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
llvm/test/CodeGen/X86/apx/push2-pop2.ll
llvm/test/CodeGen/X86/apx/pushp-popp.ll
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86FrameLowering.cpp
llvm/lib/Target/X86/X86MachineFunctionInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index ade175d99c89a8d..522d8513c9aff52 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -343,6 +343,10 @@ def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true
[FeatureAVX10_1, FeatureEVEX512]>;
def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
"Support extended general purpose register">;
+def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
+ "Support PUSH2/POP2 instructions">;
+def FeaturePPX : SubtargetFeature<"ppx", "HasPPX", "true",
+ "Support Push-Pop Acceleration">;
// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
// "string operations"). See "REP String Enhancement" in the Intel Software
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index b042f6865f40d01..c0d358ead2787b2 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -41,6 +41,7 @@
STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
STATISTIC(NumFrameExtraProbe,
"Number of extra stack probes generated in prologue");
+STATISTIC(NumFunctionUsingPush2Pop2, "Number of funtions using push2/pop2");
using namespace llvm;
@@ -139,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
return X86::MOV32ri;
}
+// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
+// value written by the PUSH from the stack. The processor tracks these marked
+// instructions internally and fast-forwards register data between matching PUSH
+// and POP instructions, without going through memory or through the training
+// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
+// memory-renaming optimization can be used.
+//
+// The PPX hint is purely a performance hint. Instructions with this hint have
+// the same functional semantics as those without. PPX hints set by the
+// compiler that violate the balancing rule may turn off the PPX optimization,
+// but they will not affect program semantics.
+//
+// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
+// are not considered).
+//
+// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
+// GPRs at a time to/from the stack.
+static unsigned getPUSHOpcode(const X86Subtarget &ST) {
+ return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
+ : X86::PUSH32r;
+}
+static unsigned getPOPOpcode(const X86Subtarget &ST) {
+ return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
+ : X86::POP32r;
+}
+static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
+ return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
+}
+static unsigned getPOP2Opcode(const X86Subtarget &ST) {
+ return ST.hasPPX() ? X86::POP2P : X86::POP2;
+}
+
static bool isEAXLiveIn(MachineBasicBlock &MBB) {
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
unsigned Reg = RegMask.PhysReg;
@@ -1679,7 +1712,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
NumBytes = alignTo(NumBytes, MaxAlign);
// Save EBP/RBP into the appropriate stack slot.
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ BuildMI(MBB, MBBI, DL,
+ TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
.addReg(MachineFramePtr, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
@@ -1818,18 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Skip the callee-saved push instructions.
bool PushedRegs = false;
int StackOffset = 2 * stackGrowth;
+ MachineBasicBlock::const_iterator LastCSPush = MBBI;
+ auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
+ if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
+ return false;
+ unsigned Opc = MBBI->getOpcode();
+ return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
+ Opc == X86::PUSH2 || Opc == X86::PUSH2P;
+ };
- while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) &&
- (MBBI->getOpcode() == X86::PUSH32r ||
- MBBI->getOpcode() == X86::PUSH64r)) {
+ while (IsCSPush(MBBI)) {
PushedRegs = true;
Register Reg = MBBI->getOperand(0).getReg();
+ LastCSPush = MBBI;
++MBBI;
+ unsigned Opc = LastCSPush->getOpcode();
if (!HasFP && NeedsDwarfCFI) {
// Mark callee-saved push instruction.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
+ // Compared to push, push2 introduces more stack offset (one more
+ // register).
+ if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+ StackOffset += stackGrowth;
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
MachineInstr::FrameSetup);
@@ -1841,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(Reg)
.setMIFlag(MachineInstr::FrameSetup);
+ if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(LastCSPush->getOperand(1).getReg())
+ .setMIFlag(MachineInstr::FrameSetup);
}
}
@@ -2317,7 +2367,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
}
// Pop EBP.
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ BuildMI(MBB, MBBI, DL,
+ TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
@@ -2357,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned Opc = PI->getOpcode();
if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
- if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- (Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy)))
+ if (!PI->getFlag(MachineInstr::FrameDestroy) ||
+ (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
+ Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
+ Opc != X86::POP2P && Opc != X86::LEA64r))
break;
FirstCSPop = PI;
}
@@ -2451,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator PI = MBBI;
unsigned Opc = PI->getOpcode();
++MBBI;
- if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
+ Opc == X86::POP2 || Opc == X86::POP2P) {
Offset += SlotSize;
+ // Compared to pop, pop2 introduces more stack offset (one more
+ // register).
+ if (Opc == X86::POP2 || Opc == X86::POP2P)
+ Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
MachineInstr::FrameDestroy);
@@ -2735,6 +2791,30 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
}
}
+ // Strategy:
+ // 1. Use push2 when
+ // a) the number of CSRs > 1 if no padding is needed
+ // b) the number of CSRs > 2 if padding is needed
+ // 2. When the number of CSR pushes is odd
+ // a. Start to use push2 from the 1st push if stack is 16B aligned.
+ // b. Start to use push2 from the 2nd push if stack is not 16B aligned.
+ // 3. When the number of CSR pushes is even, start to use push2 from the 1st
+ // push and make the stack 16B aligned before the push
+ unsigned NumRegsForPush2 = 0;
+ if (STI.hasPush2Pop2()) {
+ unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
+ return X86::GR64RegClass.contains(I.getReg());
+ });
+ bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
+ bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
+ X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
+ NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
+ if (X86FI->padForPush2Pop2()) {
+ SpillSlotOffset -= SlotSize;
+ MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ }
+ }
+
// Assign slots for GPRs. It increases frame size.
for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
Register Reg = I.getReg();
@@ -2742,6 +2822,13 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
+ // A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned
+ // or when the number of candidates collected so far is odd.
+ if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
+ (SpillSlotOffset % 16 == 0 ||
+ X86FI->getNumCandidatesForPush2Pop2() % 2))
+ X86FI->addCandidateForPush2Pop2(Reg);
+
SpillSlotOffset -= SlotSize;
CalleeSavedFrameSize += SlotSize;
@@ -2759,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// TODO: saving the slot index is better?
X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
}
+ assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
+ "Expect even candidates for push2/pop2");
+ if (X86FI->getNumCandidatesForPush2Pop2())
+ ++NumFunctionUsingPush2Pop2;
X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
@@ -2808,41 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
// Push GPRs. It increases frame size.
const MachineFunction &MF = *MBB.getParent();
- unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
- for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- Register Reg = I.getReg();
-
- if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
- continue;
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (X86FI->padForPush2Pop2())
+ emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);
+ // Update LiveIn of the basic block and decide whether we can add a kill flag
+ // to the use.
+ auto UpdateLiveInCheckCanKill = [&](Register Reg) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- bool isLiveIn = MRI.isLiveIn(Reg);
- if (!isLiveIn)
- MBB.addLiveIn(Reg);
-
- // Decide whether we can add a kill flag to the use.
- bool CanKill = !isLiveIn;
- // Check if any subregister is live-in
- if (CanKill) {
- for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
- if (MRI.isLiveIn(*AReg)) {
- CanKill = false;
- break;
- }
- }
- }
-
// Do not set a kill flag on values that are also marked as live-in. This
// happens with the @llvm-returnaddress intrinsic and with arguments
// passed in callee saved registers.
// Omitting the kill flags is conservatively correct even if the live-in
// is not used after all.
- BuildMI(MBB, MI, DL, TII.get(Opc))
- .addReg(Reg, getKillRegState(CanKill))
- .setMIFlag(MachineInstr::FrameSetup);
+ if (MRI.isLiveIn(Reg))
+ return false;
+ MBB.addLiveIn(Reg);
+ // Check if any subregister is live-in
+ for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
+ if (MRI.isLiveIn(*AReg))
+ return false;
+ return true;
+ };
+ auto UpdateLiveInGetKillRegState = [&](Register Reg) {
+ return getKillRegState(UpdateLiveInCheckCanKill(Reg));
+ };
+
+ for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
+ Register Reg = RI->getReg();
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ if (X86FI->isCandidateForPush2Pop2(Reg)) {
+ Register Reg2 = (++RI)->getReg();
+ BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
+ .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+ .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
+ .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
- const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (X86FI->getRestoreBasePointer()) {
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
Register BaseReg = this->TRI->getBaseRegister();
@@ -2958,15 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
}
// POP GPRs.
- unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
- for (const CalleeSavedInfo &I : CSI) {
- Register Reg = I.getReg();
+ for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ Register Reg = I->getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
- BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
- .setMIFlag(MachineInstr::FrameDestroy);
+ if (X86FI->isCandidateForPush2Pop2(Reg))
+ BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
+ .addReg((++I)->getReg(), RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ else
+ BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
+ .setMIFlag(MachineInstr::FrameDestroy);
}
+ if (X86FI->padForPush2Pop2())
+ emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);
+
return true;
}
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 9b2cc35c57e00ec..f6e853270e073bd 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include <set>
namespace llvm {
@@ -117,6 +118,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// determine if we should insert tilerelease in frame lowering.
bool HasVirtualTileReg = false;
+ /// Adjust stack for push2/pop2
+ bool PadForPush2Pop2 = false;
+
+ /// Candidate registers for push2/pop2
+ std::set<Register> CandidatesForPush2Pop2;
+
/// True if this function has CFI directives that adjust the CFA.
/// This is used to determine if we should direct the debugger to use
/// the CFA instead of the stack pointer.
@@ -165,7 +172,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
return WinEHXMMSlotInfo; }
- unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ unsigned getCalleeSavedFrameSize() const {
+ return CalleeSavedFrameSize + 8 * padForPush2Pop2();
+ }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
@@ -232,6 +241,19 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
bool hasVirtualTileReg() const { return HasVirtualTileReg; }
void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; }
+ bool padForPush2Pop2() const { return PadForPush2Pop2; }
+ void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; }
+
+ bool isCandidateForPush2Pop2(Register Reg) const {
+ return CandidatesForPush2Pop2.find(Reg) != CandidatesForPush2Pop2.end();
+ }
+ void addCandidateForPush2Pop2(Register Reg) {
+ CandidatesForPush2Pop2.insert(Reg);
+ }
+ size_t getNumCandidatesForPush2Pop2() const {
+ return CandidatesForPush2Pop2.size();
+ }
+
bool hasCFIAdjustCfa() const { return HasCFIAdjustCfa; }
void setHasCFIAdjustCfa(bool v) { HasCFIAdjustCfa = v; }
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
new file mode 100644
index 000000000000000..6c9fdc2adce2ff5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=LIN-REF
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+push2pop2 | FileCheck %s --check-prefix=LIN
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=LIN-PPX
+; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s --check-prefix=WIN-REF
+; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=+push2pop2 | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=WIN-PPX
+
+define i32 @csr6_alloc16(ptr %argv) {
+; LIN-REF-LABEL: csr6_alloc16:
+; LIN-REF: # %bb.0: # %entry
+; LIN-REF-NEXT: pushq %rbp
+; LIN-REF-NEXT: .cfi_def_cfa_offset 16
+; LIN-REF-NEXT: pushq %r15
+; LIN-REF-NEXT: .cfi_def_cfa_offset 24
+; LIN-REF-NEXT: pushq %r14
+; LIN-REF-NEXT: .cfi_def_cfa_offset 32
+; LIN-REF-NEXT: pushq %r13
+; LIN-REF-NEXT: .cfi_def_cfa_offset 40
+; LIN-REF-NEXT: pushq %r12
+; LIN-REF-NEXT: .cfi_def_cfa_offset 48
+; LIN-REF-NEXT: pushq %rbx
+; LIN-REF-NEXT: .cfi_def_cfa_offset 56
+; LIN-REF-NEXT: subq $24, %rsp
+; LIN-REF-NEXT: .cfi_def_cfa_offset 80
+; LIN-REF-NEXT: .cfi_offset %rbx, -56
+; LIN-REF-NEXT: .cfi_offset %r12, -48
+; LIN-REF-NEXT: .cfi_offset %r13, -40
+; LIN-REF-NEXT: .cfi_offset %r14, -32
+; LIN-REF-NEXT: .cfi_offset %r15, -24
+; LIN-REF-NEXT: .cfi_offset %rbp, -16
+; LIN-REF-NEXT: #APP
+; LIN-REF-NEXT: #NO_APP
+; LIN-REF-NEXT: xorl %ecx, %ecx
+; LIN-REF-NEXT: xorl %eax, %eax
+; LIN-REF-NEXT: callq *%rcx
+; LIN-REF-NEXT: addq $24, %rsp
+; LIN-REF-NEXT: .cfi_def_cfa_offset 56
+; LIN-REF-NEXT: popq %rbx
+; LIN-REF-NEXT: .cfi_def_cfa_offset 48
+; LIN-REF-NEXT: popq %r12
+; LIN-REF-NEXT: .cfi_def_cfa_offset 40
+; LIN-REF-NEXT: popq %r13
+; LIN-REF-NEXT: .cfi_def_cfa_offset 32
+; LIN-REF-NEXT: popq %r14
+; LIN-REF-NEXT: .cfi_def_cfa_offset 24
+; LIN-REF-NEXT: popq %r15
+; LIN-REF-NEXT: .cfi_def_cfa_offset 16
+; LIN-REF-NEXT: popq %rbp
+; LIN-REF-NEXT: .cfi_def_cfa_offset 8
+; LIN-REF-NEXT: retq
+;
+; LIN-LABEL: csr6_alloc16:
+; LIN: # %bb.0: # %entry
+; LIN-NEXT: pushq %rax
+; LIN-NEXT: .cfi_def_cfa_offset 16
+; LIN-NEXT: push2 %r15, %rbp
+; LIN-NEXT: .cfi_def_cfa_offset 32
+; LIN-NEXT: push2 %r13, %r14
+; LIN-NEXT: .cfi_def_cfa_offset 48
+; LIN-NEXT: push2 %rbx, %r12
+; LIN-NEXT: .cfi_def_cfa_offset 64
+; LIN-NEXT: subq $32, %rsp
+; LIN-NEXT: .cfi_def_cfa_offset 96
+; LIN-NEXT: .cfi_offset %rbx, -64
+; LIN-NEXT: .cfi_offset %r12, -56
+; LIN-NEXT: .cfi_offset %r13, -48
+; LIN-NEXT: .cfi_offset %r14, -40
+; LIN-NEXT: .cfi_offset %r15, -32
+; LIN-NEXT: .cfi_offset %rbp, -24
+; LIN-NEXT: #APP
+; LIN-NEXT: #NO_APP
+; LIN-NEXT: xorl %ecx, %ecx
+; LIN-NEXT: xorl %eax, %eax
+; LIN-NEXT: callq *%rcx
+; LIN-NEXT: addq $32, %rsp
+; LIN-NEXT: .cfi_def_cfa_offset 64
+; LIN-NEXT: pop2 %r12, %rbx
+; LIN-NEXT: .cfi_def_cfa_offset 48
+; LIN-NEXT: pop2 %r14, %r13
+; LIN-NEXT: .cfi_def_cfa_offset 32
+; LIN-NEXT: pop2 %rbp, %r15
+; LIN-NEXT: .cfi_def_cfa_offset 16
+; LIN-NEXT: popq %rcx
+; LIN-NEXT: .cfi_def_cfa_offset 8
+; LIN-NEXT: retq
+;
+; LIN-PPX-LABEL: csr6_alloc16:
+; LIN-PPX: # %bb.0: # %entry
+; LIN-PPX-NEXT: pushq %rax
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 16
+; LIN-PPX-NEXT: push2p %r15, %rbp
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 32
+; LIN-PPX-NEXT: push2p %r13, %r14
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 48
+; LIN-PPX-NEXT: push2p %rbx, %r12
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 64
+; LIN-PPX-NEXT: subq $32, %rsp
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 96
+; LIN-PPX-NEXT: .cfi_offset %rbx, -64
+; LIN-PPX-NEXT: .cfi_offset %r12, -56
+; LIN-PPX-NEXT: .cfi_offset %r13, -48
+; LIN-PPX-NEXT: .cfi_offset %r14, -40
+; LIN-PPX-NEXT: .cfi_offset %r15, -32
+; LIN-PPX-NEXT: .cfi_offset %rbp, -24
+; LIN-PPX-NEXT: #APP
+; LIN-PPX-NEXT: #NO_APP
+; LIN-PPX-NEXT: xorl %ecx, %ecx
+; LIN-PPX-NEXT: xorl %eax, %eax
+; LIN-PPX-NEXT: callq *%rcx
+; LIN-PPX-NEXT: addq $32, %rsp
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 64
+; LIN-PPX-NEXT: pop2p %r12, %rbx
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 48
+; LIN-PPX-NEXT: pop2p %r14, %r13
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 32
+; LIN-PPX-NEXT: pop2p %rbp, %r15
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 16
+; LIN-PPX-NEXT: popq %rcx
+; LIN-PPX-NEXT: .cfi_def_cfa_offset 8
+; LIN-PPX-NEXT: retq
+;
+; WIN-REF-LABEL: csr6_alloc16:
+; WIN-REF: # %bb.0: # %entry
+; WIN-REF-NEXT: pushq %r15
+; WIN-REF-NEXT: .seh_pushreg %r15
+; WIN-REF-NEXT: pushq %r14
+; WIN-REF-NEXT: .seh_pushreg %r14
+; WIN-REF-NEXT: pushq %r13
+; WIN-REF-NEXT: .seh_pushreg %r13
+; WIN-REF-NEXT: pushq %r12
+; WIN-REF-NEXT: .seh_pushreg %r12
+; WIN-REF-NEXT: pushq %rbp
+; WIN-REF-NEXT: .seh_pushreg %rbp
+; WIN-REF-NEXT: pushq %rbx
+; WIN-REF-NEXT: .seh_pushreg %rbx
+; WIN-REF-NEXT: subq $56, %rsp
+; WIN-REF-NEXT: .seh_stackalloc 56
+; WIN-REF-NEXT: .seh_endprologue
+; WIN-REF-NEXT: #APP
+; WIN-REF-NEXT: #NO_APP
+; WIN-REF-NEXT: xorl %eax, %eax
+; WIN-REF-NEXT: callq *%rax
+; WIN-REF-NEXT: nop
+; WIN-REF-NEXT: addq $56, %rsp
+; WIN-REF-NEXT: popq %rbx
+; WIN-REF-NEXT: popq %rbp
+; WIN-REF-NEXT: popq %r12
+; WIN-REF-NEXT: popq %r13
+; WIN-REF-NEXT: popq %r14
+; WIN-REF-NEXT: popq %r15
+; WIN-REF-NEXT: retq
+; WIN-REF-NEXT: .seh_endproc
+;
+; WIN-LABEL: csr6_alloc16:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: pushq %rax
+; WIN-NEXT: .seh_pushreg %rax
+; WIN-NEXT: push2 %r14, %r15
+; WIN-NEXT: .seh_pushreg %r15
+; WIN-NEXT: .seh_pushreg %r14
+; WIN-NEXT: push2 %r12, %r13
+; WIN-NEXT: .seh_pushreg %r13
+; WIN-NEXT: .seh_pushreg %r12
+; WIN-NEXT: push2 %rbx, %rbp
+; WIN-NEXT: .seh_pushreg %rbp
+; WIN-NEXT: .seh_pushreg %rbx
+; WIN-NEXT: subq $64, %rsp
+; WIN-NEXT: .seh_stackalloc 64
+; WIN-NEXT: .seh_endprologue
+; WIN-NEXT: #APP
+; WIN-NEXT: #NO_APP
+; WIN-NEXT: xorl %eax, %eax
+; WIN-NEXT: callq *%rax
+; WIN-NEXT: nop
+; WIN-NEXT: addq $64, %rsp
+; WIN-NEXT: pop2 %rbp, %rbx
+; WIN-NEXT: pop2 %r13, %r12
+; WIN-NEXT: pop2 %r15, %r14
+; WIN-NEXT: popq %rcx
+; WIN-NEXT: retq
+; WIN-NEXT: .seh_endproc
+;
+; WIN-PPX-LABEL: csr6_alloc16:
+; WIN-PPX: # %bb.0: # %entry
+; WIN-PPX-NEXT: pushq %rax
+; WIN-PPX-NEXT: .seh_pushreg %rax
+; WIN-PPX-NEXT: push2p %r14, %r15
+; WIN-PPX-NEXT: .seh_pushreg %r15
+; WIN-PPX-NEXT: .seh_pushreg %r14
+; WIN-PPX-NEXT: push2p %r12, %r13
+; WIN-PPX-NEXT: .seh_pushreg %r13
+; WIN-PPX-NEXT: .seh_pushreg %r12
+; WIN-PPX-NEXT: push2p %rbx, %rbp
+; WIN-PPX-NEXT: .seh_pushreg %rbp
+; WIN-PPX-NEXT: .seh_pushreg %rbx
+; WIN-PPX-NEXT: subq $64, %rsp
+; WIN-PPX-NEXT: .seh_stackalloc 64
+; WIN-PPX-NEXT: .seh_endprologue
+; WIN-PPX-NEXT: #APP
+; WIN-PPX-NEXT: #NO_APP
+; WIN-PPX-NEXT: xorl %eax, %eax
+; WIN-PPX-NEXT: callq *%rax
+; WIN-PPX-NEXT: nop
+; WIN-PPX-NEXT: addq $64, %rsp
+; WIN-PPX-NEXT: pop2p %rbp, %rbx
+; WIN-PPX-NEXT: pop2p %r13, %r12
+; WIN-PPX-NEXT: pop2p %r15, %r14
+; WIN-PPX-NEXT: popq %rcx
+; WIN-PPX-NEXT: retq
+; WIN-PPX-NEXT: .seh_endproc
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"()
+ %a = alloca [3 x ptr], align 8
+ %b = call ptr (...) null()
+ ret i32 undef
+}
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
new file mode 100644
index 000000000000000..aa5c54d30e3bc45
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Check PUSH2/POP2 is not used for vector registers
+; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+
+define void @widget(float %arg) nounwind {
+; CHECK-LABEL: widget:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: push2 %rbp, %rsi
+; CHECK-NEXT: subq $48, %rsp
+; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, %xmm6
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq *%rsi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: callq *%rsi
+; CHECK-NEXT: movss %xmm6, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; CHECK-NEXT: addq $48, %rsp
+; CHECK-NEXT: pop2 %rsi, %rbp
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: retq
+;
+; FRAME-LABEL: widget:
+; FRAME: # %bb.0: # %bb
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: push2 %rsi, %r15
+; FRAME-NEXT: subq $48, %rsp
+; FRAME-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
+; FRAME-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FRAME-NEXT: movaps %xmm0, %xmm6
+; FRAME-NEXT: xorl %esi, %esi
+; FRAME-NEXT: xorl %ecx, %ecx
+; FRAME-NEXT: callq *%rsi
+; FRAME-NEXT: xorl %ecx, %ecx
+; FRAME-NEXT: xorl %edx, %edx
+; FRAME-NEXT: xorl %r8d, %r8d
+; FRAME-NEXT: callq *%rsi
+; FRAME-NEXT: movss %xmm6, 0
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; FRAME-NEXT: addq $48, %rsp
+; FRAME-NEXT: pop2 %r15, %rsi
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+bb:
+ %call = tail call float null(ptr null)
+ %call1 = tail call i32 null(ptr null, i32 0, i32 0)
+ store float %arg, ptr null, align 4
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2.ll b/llvm/test/CodeGen/X86/apx/push2-pop2.ll
new file mode 100644
index 000000000000000..25139f1da8272c9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2.ll
@@ -0,0 +1,432 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=PPX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+
+define void @csr1() nounwind {
+; CHECK-LABEL: csr1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr1:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushp %rbp
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: popp %rbp
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr1:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr2() nounwind {
+; CHECK-LABEL: csr2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr2:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushp %rbp
+; PPX-NEXT: pushp %r15
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: popp %r15
+; PPX-NEXT: popp %rbp
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr2:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: pushq %r15
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: popq %r15
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr3() nounwind {
+; CHECK-LABEL: csr3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: push2 %r14, %r15
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pop2 %r15, %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr3:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushp %rbp
+; PPX-NEXT: push2p %r14, %r15
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: pop2p %r15, %r14
+; PPX-NEXT: popp %rbp
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr3:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: push2 %r14, %r15
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: pop2 %r15, %r14
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr4() nounwind {
+; CHECK-LABEL: csr4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: push2 %r15, %rbp
+; CHECK-NEXT: push2 %r13, %r14
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pop2 %r14, %r13
+; CHECK-NEXT: pop2 %rbp, %r15
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr4:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushq %rax
+; PPX-NEXT: push2p %r15, %rbp
+; PPX-NEXT: push2p %r13, %r14
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: pop2p %r14, %r13
+; PPX-NEXT: pop2p %rbp, %r15
+; PPX-NEXT: popq %rax
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr4:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: push2 %r14, %r15
+; FRAME-NEXT: pushq %r13
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: popq %r13
+; FRAME-NEXT: pop2 %r15, %r14
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr5() nounwind {
+; CHECK-LABEL: csr5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: push2 %r14, %r15
+; CHECK-NEXT: push2 %r12, %r13
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pop2 %r13, %r12
+; CHECK-NEXT: pop2 %r15, %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr5:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushp %rbp
+; PPX-NEXT: push2p %r14, %r15
+; PPX-NEXT: push2p %r12, %r13
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: pop2p %r13, %r12
+; PPX-NEXT: pop2p %r15, %r14
+; PPX-NEXT: popp %rbp
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr5:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: push2 %r14, %r15
+; FRAME-NEXT: push2 %r12, %r13
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: pop2 %r13, %r12
+; FRAME-NEXT: pop2 %r15, %r14
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr6() nounwind {
+; CHECK-LABEL: csr6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: push2 %r15, %rbp
+; CHECK-NEXT: push2 %r13, %r14
+; CHECK-NEXT: push2 %rbx, %r12
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pop2 %r12, %rbx
+; CHECK-NEXT: pop2 %r14, %r13
+; CHECK-NEXT: pop2 %rbp, %r15
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: csr6:
+; PPX: # %bb.0: # %entry
+; PPX-NEXT: pushq %rax
+; PPX-NEXT: push2p %r15, %rbp
+; PPX-NEXT: push2p %r13, %r14
+; PPX-NEXT: push2p %rbx, %r12
+; PPX-NEXT: #APP
+; PPX-NEXT: #NO_APP
+; PPX-NEXT: pop2p %r12, %rbx
+; PPX-NEXT: pop2p %r14, %r13
+; PPX-NEXT: pop2p %rbp, %r15
+; PPX-NEXT: popq %rax
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: csr6:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: push2 %r14, %r15
+; FRAME-NEXT: push2 %r12, %r13
+; FRAME-NEXT: pushq %rbx
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: popq %rbx
+; FRAME-NEXT: pop2 %r13, %r12
+; FRAME-NEXT: pop2 %r15, %r14
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind {
+; CHECK-LABEL: lea_in_epilog:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_5
+; CHECK-NEXT: # %bb.1: # %bb13
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: push2 %r15, %rbp
+; CHECK-NEXT: push2 %r13, %r14
+; CHECK-NEXT: push2 %rbx, %r12
+; CHECK-NEXT: subq $16, %rsp
+; CHECK-NEXT: movq %r9, %r14
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: addq %r14, %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: addq %r14, %rbx
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB6_2: # %bb15
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: incq %r12
+; CHECK-NEXT: movl $432, %edx # imm = 0x1B0
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: movq %r15, %rsi
+; CHECK-NEXT: callq memcpy@PLT
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: addq %rax, %r13
+; CHECK-NEXT: addq %rax, %r15
+; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq $8, %rbp
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_2
+; CHECK-NEXT: # %bb.3: # %bb11
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT: pop2 %r12, %rbx
+; CHECK-NEXT: pop2 %r14, %r13
+; CHECK-NEXT: pop2 %rbp, %r15
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT: jne .LBB6_5
+; CHECK-NEXT: # %bb.4: # %bb12
+; CHECK-NEXT: movq $0, (%rax)
+; CHECK-NEXT: .LBB6_5: # %bb14
+; CHECK-NEXT: retq
+;
+; PPX-LABEL: lea_in_epilog:
+; PPX: # %bb.0: # %bb
+; PPX-NEXT: testb $1, %dil
+; PPX-NEXT: je .LBB6_5
+; PPX-NEXT: # %bb.1: # %bb13
+; PPX-NEXT: pushq %rax
+; PPX-NEXT: push2p %r15, %rbp
+; PPX-NEXT: push2p %r13, %r14
+; PPX-NEXT: push2p %rbx, %r12
+; PPX-NEXT: subq $16, %rsp
+; PPX-NEXT: movq %r9, %r14
+; PPX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; PPX-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; PPX-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; PPX-NEXT: addq %r14, %r13
+; PPX-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; PPX-NEXT: addq %r14, %r15
+; PPX-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; PPX-NEXT: addq %r14, %rbx
+; PPX-NEXT: xorl %ebp, %ebp
+; PPX-NEXT: xorl %r12d, %r12d
+; PPX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; PPX-NEXT: .p2align 4, 0x90
+; PPX-NEXT: .LBB6_2: # %bb15
+; PPX-NEXT: # =>This Inner Loop Header: Depth=1
+; PPX-NEXT: incq %r12
+; PPX-NEXT: movl $432, %edx # imm = 0x1B0
+; PPX-NEXT: xorl %edi, %edi
+; PPX-NEXT: movq %r15, %rsi
+; PPX-NEXT: callq memcpy@PLT
+; PPX-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; PPX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; PPX-NEXT: addq %rax, %r13
+; PPX-NEXT: addq %rax, %r15
+; PPX-NEXT: addq %rax, %rbx
+; PPX-NEXT: addq %rax, %r14
+; PPX-NEXT: addq $8, %rbp
+; PPX-NEXT: testb $1, %dil
+; PPX-NEXT: je .LBB6_2
+; PPX-NEXT: # %bb.3: # %bb11
+; PPX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; PPX-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; PPX-NEXT: pop2p %r12, %rbx
+; PPX-NEXT: pop2p %r14, %r13
+; PPX-NEXT: pop2p %rbp, %r15
+; PPX-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; PPX-NEXT: jne .LBB6_5
+; PPX-NEXT: # %bb.4: # %bb12
+; PPX-NEXT: movq $0, (%rax)
+; PPX-NEXT: .LBB6_5: # %bb14
+; PPX-NEXT: retq
+;
+; FRAME-LABEL: lea_in_epilog:
+; FRAME: # %bb.0: # %bb
+; FRAME-NEXT: testb $1, %dil
+; FRAME-NEXT: je .LBB6_5
+; FRAME-NEXT: # %bb.1: # %bb13
+; FRAME-NEXT: pushq %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: push2 %r14, %r15
+; FRAME-NEXT: push2 %r12, %r13
+; FRAME-NEXT: pushq %rbx
+; FRAME-NEXT: subq $24, %rsp
+; FRAME-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FRAME-NEXT: addq 16(%rbp), %r9
+; FRAME-NEXT: movq 48(%rbp), %rbx
+; FRAME-NEXT: addq %r9, %rbx
+; FRAME-NEXT: movq 40(%rbp), %r12
+; FRAME-NEXT: addq %r9, %r12
+; FRAME-NEXT: movq 32(%rbp), %r15
+; FRAME-NEXT: addq %r9, %r15
+; FRAME-NEXT: xorl %r13d, %r13d
+; FRAME-NEXT: xorl %r14d, %r14d
+; FRAME-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FRAME-NEXT: .p2align 4, 0x90
+; FRAME-NEXT: .LBB6_2: # %bb15
+; FRAME-NEXT: # =>This Inner Loop Header: Depth=1
+; FRAME-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FRAME-NEXT: incq %r14
+; FRAME-NEXT: movl $432, %edx # imm = 0x1B0
+; FRAME-NEXT: xorl %edi, %edi
+; FRAME-NEXT: movq %r12, %rsi
+; FRAME-NEXT: callq memcpy@PLT
+; FRAME-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; FRAME-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; FRAME-NEXT: movq 16(%rbp), %rax
+; FRAME-NEXT: addq %rax, %rbx
+; FRAME-NEXT: addq %rax, %r12
+; FRAME-NEXT: addq %rax, %r15
+; FRAME-NEXT: addq %rax, %r9
+; FRAME-NEXT: addq $8, %r13
+; FRAME-NEXT: testb $1, %dil
+; FRAME-NEXT: je .LBB6_2
+; FRAME-NEXT: # %bb.3: # %bb11
+; FRAME-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; FRAME-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; FRAME-NEXT: popq %rbx
+; FRAME-NEXT: pop2 %r13, %r12
+; FRAME-NEXT: pop2 %r15, %r14
+; FRAME-NEXT: popq %rbp
+; FRAME-NEXT: jne .LBB6_5
+; FRAME-NEXT: # %bb.4: # %bb12
+; FRAME-NEXT: movq $0, (%rax)
+; FRAME-NEXT: .LBB6_5: # %bb14
+; FRAME-NEXT: retq
+bb:
+ br i1 %arg, label %bb13, label %bb14
+
+bb11:
+ br i1 %arg, label %bb14, label %bb12
+
+bb12:
+ store double 0.000000e+00, ptr %arg1, align 8
+ br label %bb14
+
+bb13:
+ %getelementptr = getelementptr i8, ptr null, i64 %arg5
+ br label %bb15
+
+bb14:
+ ret void
+
+bb15:
+ %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ]
+ %getelementptr16 = getelementptr double, ptr null, i64 %phi
+ %add = add i64 %phi, 1
+ %mul = mul i64 %arg6, %add
+ %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul
+ call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false)
+ %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7
+ %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false)
+ %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false)
+ %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false)
+ br i1 %arg, label %bb11, label %bb15
+}
diff --git a/llvm/test/CodeGen/X86/apx/pushp-popp.ll b/llvm/test/CodeGen/X86/apx/pushp-popp.ll
new file mode 100644
index 000000000000000..ad4306fccce6697
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/pushp-popp.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+
+define void @csr2() nounwind {
+; CHECK-LABEL: csr2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushp %rbp
+; CHECK-NEXT: pushp %r15
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popp %r15
+; CHECK-NEXT: popp %rbp
+; CHECK-NEXT: retq
+;
+; FRAME-LABEL: csr2:
+; FRAME: # %bb.0: # %entry
+; FRAME-NEXT: pushp %rbp
+; FRAME-NEXT: movq %rsp, %rbp
+; FRAME-NEXT: pushp %r15
+; FRAME-NEXT: #APP
+; FRAME-NEXT: #NO_APP
+; FRAME-NEXT: popp %r15
+; FRAME-NEXT: popp %rbp
+; FRAME-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
More information about the llvm-commits
mailing list