[llvm] r304530 - [ARM] Cortex-A57 scheduling model for ARM backend (AArch32)
Javed Absar via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 2 01:53:20 PDT 2017
Author: javed.absar
Date: Fri Jun 2 03:53:19 2017
New Revision: 304530
URL: http://llvm.org/viewvc/llvm-project?rev=304530&view=rev
Log:
[ARM] Cortex-A57 scheduling model for ARM backend (AArch32)
This patch implements the Cortex-A57 scheduling model.
The main code is in ARMScheduleA57.td, ARMScheduleA57WriteRes.td.
Small changes in cpp,.h files to support required scheduling predicates.
Scheduling model implemented according to:
http://infocenter.arm.com/help/topic/com.arm.doc.uan0015b/Cortex_A57_Software_Optimization_Guide_external.pdf.
Patch by : Andrew Zhogin (submitted on his behalf, as requested).
Rewiewed by: Renato Golin, Diana Picus, Javed Absar, Kristof Beyls.
Differential Revision: https://reviews.llvm.org/D28152
Added:
llvm/trunk/lib/Target/ARM/ARMScheduleA57.td
llvm/trunk/lib/Target/ARM/ARMScheduleA57WriteRes.td
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-alu.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-basic.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
Modified:
llvm/trunk/include/llvm/CodeGen/TargetSchedule.h
llvm/trunk/lib/Target/ARM/ARM.td
llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.h
llvm/trunk/lib/Target/ARM/ARMSchedule.td
llvm/trunk/lib/Target/ARM/ARMSubtarget.h
Modified: llvm/trunk/include/llvm/CodeGen/TargetSchedule.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetSchedule.h?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetSchedule.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetSchedule.h Fri Jun 2 03:53:19 2017
@@ -55,6 +55,9 @@ public:
/// Return the MCSchedClassDesc for this instruction.
const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
+ /// \brief TargetSubtargetInfo getter.
+ const TargetSubtargetInfo *getSubtargetInfo() const { return STI; }
+
/// \brief TargetInstrInfo getter.
const TargetInstrInfo *getInstrInfo() const { return TII; }
Modified: llvm/trunk/lib/Target/ARM/ARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARM.td?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARM.td (original)
+++ llvm/trunk/lib/Target/ARM/ARM.td Fri Jun 2 03:53:19 2017
@@ -205,6 +205,13 @@ def FeatureAvoidPartialCPSR : SubtargetF
"AvoidCPSRPartialUpdate", "true",
"Avoid CPSR partial update for OOO execution">;
+/// Disable +1 predication cost for instructions updating CPSR.
+/// Enabled for Cortex-A57.
+def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr",
+ "CheapPredicableCPSRDef",
+ "true",
+ "Disable +1 predication cost for instructions updating CPSR">;
+
def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
"AvoidMOVsShifterOperand", "true",
"Avoid movs instructions with shifter operand">;
@@ -788,12 +795,14 @@ def : ProcNoItin<"cortex-a53",
FeatureCRC,
FeatureFPAO]>;
-def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC,
- FeatureFPAO]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFPAO,
+ FeatureAvoidPartialCPSR,
+ FeatureCheapPredicableCPSR]>;
def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
FeatureHWDivThumb,
Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp Fri Jun 2 03:53:19 2017
@@ -558,13 +558,68 @@ bool ARMBaseInstrInfo::DefinesPredicate(
return Found;
}
-static bool isCPSRDefined(const MachineInstr *MI) {
- for (const auto &MO : MI->operands())
+bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) {
+ for (const auto &MO : MI.operands())
if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead())
return true;
return false;
}
+bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ return Offset.getReg() != 0;
+}
+
+// Load with negative register offset requires additional 1cyc and +I unit
+// for Cortex A57
+bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ assert(Opc.isImm());
+ assert(Offset.isReg());
+ int64_t OpcImm = Opc.getImm();
+
+ bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub;
+ return (isSub && Offset.getReg() != 0);
+}
+
+bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
+// Load, scaled register offset, not plus LSL2
+bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+
+ bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ if (ShiftOpc == ARM_AM::no_shift) return false; // not scaled
+ bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2);
+ return !SimpleScaled;
+}
+
+// Minus reg for ldstso addr mode
+bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+}
+
+// Load, scaled register offset
+bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
static bool isEligibleForITBlock(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return true;
@@ -590,7 +645,7 @@ static bool isEligibleForITBlock(const M
case ARM::tSUBi3: // SUB (immediate) T1
case ARM::tSUBi8: // SUB (immediate) T2
case ARM::tSUBrr: // SUB (register) T1
- return !isCPSRDefined(MI);
+ return !ARMBaseInstrInfo::isCPSRDefined(*MI);
}
}
@@ -3349,6 +3404,22 @@ ARMBaseInstrInfo::getVLDMDefCycle(const
return DefCycle;
}
+bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
+ unsigned BaseReg = MI.getOperand(0).getReg();
+ for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
+ const auto &Op = MI.getOperand(i);
+ if (Op.isReg() && Op.getReg() == BaseReg)
+ return true;
+ }
+ return false;
+}
+unsigned
+ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
+ // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops
+ // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+ return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
+}
+
int
ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -4119,7 +4190,8 @@ unsigned ARMBaseInstrInfo::getPredicatio
const MCInstrDesc &MCID = MI.getDesc();
- if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+ if (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef())) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
return 1;
@@ -4148,7 +4220,8 @@ unsigned ARMBaseInstrInfo::getInstrLaten
}
const MCInstrDesc &MCID = MI.getDesc();
- if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+ if (PredCost && (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef()))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
*PredCost = 1;
Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.h?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.h Fri Jun 2 03:53:19 2017
@@ -159,6 +159,24 @@ public:
bool isPredicable(const MachineInstr &MI) const override;
+ // CPSR defined in instruction
+ static bool isCPSRDefined(const MachineInstr &MI);
+ bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const;
+ bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const;
+
+ // Load, scaled register offset
+ bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load, scaled register offset, not plus LSL2
+ bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const;
+ // Minus reg for ldstso addr mode
+ bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const;
+ // Scaled register offset in address mode 2
+ bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load multiple, base reg in list
+ bool isLDMBaseRegInList(const MachineInstr &MI) const;
+ // get LDM variable defs size
+ unsigned getLDMVariableDefsSize(const MachineInstr &MI) const;
+
/// GetInstSize - Returns the size of the specified MachineInstr.
///
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
Modified: llvm/trunk/lib/Target/ARM/ARMSchedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSchedule.td?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSchedule.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMSchedule.td Fri Jun 2 03:53:19 2017
@@ -147,6 +147,9 @@ def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
(void)TII;
+ const ARMSubtarget *STI =
+ static_cast<const ARMSubtarget*>(SchedModel->getSubtargetInfo());
+ (void)STI;
}]>;
def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
@@ -420,3 +423,4 @@ include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
+include "ARMScheduleA57.td"
Added: llvm/trunk/lib/Target/ARM/ARMScheduleA57.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMScheduleA57.td?rev=304530&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMScheduleA57.td (added)
+++ llvm/trunk/lib/Target/ARM/ARMScheduleA57.td Fri Jun 2 03:53:19 2017
@@ -0,0 +1,1471 @@
+//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
+def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
+def IsCPSRDefinedAndPredicatedPred :
+ SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
+
+// Cortex A57 rev. r1p0 or later (false = r0px)
+def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
+
+// If Addrmode3 contains register offset (not immediate)
+def IsLdrAm3RegOffPred :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3RegOffPredX2 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
+def IsLdrAm3RegOffPredX3 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
+
+// If Addrmode3 contains "minus register"
+def IsLdrAm3NegRegOffPred :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3NegRegOffPredX2 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
+def IsLdrAm3NegRegOffPredX3 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
+
+// Load, scaled register offset, not plus LSL2
+def IsLdstsoScaledNotOptimalPredX0 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
+def IsLdstsoScaledNotOptimalPred :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
+def IsLdstsoScaledNotOptimalPredX2 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdstsoScaledPred :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
+def IsLdstsoScaledPredX2 :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
+
+def IsLdstsoMinusRegPredX0 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
+def IsLdstsoMinusRegPred :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
+def IsLdstsoMinusRegPredX2 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdrAm2ScaledPred :
+ SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
+
+// LDM, base reg in list
+def IsLdmBaseRegInList :
+ SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
+
+class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
+ list <SchedWriteRes> Writes = writes;
+ SchedMachineModel SchedModel = ?;
+}
+
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// (AArch64SchedA57.td)
+def CortexA57Model : SchedMachineModel {
+ let IssueWidth = 3; // 3-way decode and dispatch
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 16; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+
+def A57UnitX : ProcResource<1>; // Type X micro-ops (F1)
+def A57UnitW : ProcResource<1>; // Type W micro-ops (F0)
+
+let SchedModel = CortexA57Model in {
+ def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
+}
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "ARMScheduleA57WriteRes.td"
+
+// To have "CompleteModel = 1", support of pseudos and special instructions
+def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
+ "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
+ "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
+ "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
+ "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+
+def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
+
+// Specific memory instrs
+def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
+ "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
+
+// coprocessor moves
+def : InstRW<[WriteNoop, WriteNoop], (instregex
+ "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
+ "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
+ "(t2)?MSR(banked|i|_AR|_M)?$")>;
+
+// Deprecated instructions
+def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
+
+// Pseudos
+def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
+ "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
+ "tLDRpci_pic", "t2SUBS_PC_LR",
+ "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
+ "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "WIN__CHKSTK", "WIN__DBZCHK")>;
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
+
+// --- 3.2 Branch Instructions ---
+// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
+
+def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
+ "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
+def : InstRW<[A57Write_1cyc_1B_1I],
+ (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
+// Pseudos
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
+def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
+ "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
+def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
+
+// --- 3.3 Arithmetic and Logical Instructions ---
+// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
+// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
+
+def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
+
+// shift by register, conditional or unconditional
+// TODO: according to the doc, conditional uses I0/I1, unconditional uses M
+// Why more complex instruction uses more simple pipeline?
+// May be an error in doc.
+def A57WriteALUsi : SchedWriteVariant<[
+ // lsl #2, lsl #1, or lsr #1.
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUSsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57ReadALUsr : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>
+]>;
+def : SchedAlias<WriteALUsi, A57WriteALUsi>;
+def : SchedAlias<WriteALUsr, A57WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
+def : SchedAlias<ReadALUsr, A57ReadALUsr>;
+
+def A57WriteCMPsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : SchedAlias<WriteCMP, A57Write_1cyc_1I>;
+def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
+def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
+
+// --- 3.4 Move and Shift Instructions ---
+// Move, basic
+// MOV{S}, MOVW, MVN{S}
+def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
+ "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
+ "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
+
+// Move, shift by immed, setflags/no setflags
+// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
+// setflags = isCPSRDefined
+def A57WriteMOVsi : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
+ "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
+ "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
+
+// shift by register, conditional or unconditional, setflags/no setflags
+def A57WriteMOVsr : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
+ "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
+ "(t2|t)RORrr")>;
+
+// Move, top
+// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
+def A57WriteMOVT : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
+
+def A57WriteI2pc :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
+def A57WriteI2ld :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
+def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+// +2cyc for branch forms
+def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
+
+// --- 3.5 Divide and Multiply Instructions ---
+// Divide: SDIV, UDIV
+// latency from documentration: 4 Ââ 20, maximum taken
+def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
+// Multiply: tMul not bound to common WriteRes types
+def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
+def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
+def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
+def : ReadAdvance<ReadMUL, 0>;
+
+// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
+// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
+// Multiply-accumulate pipelines support late-forwarding of accumulate operands
+// from similar μops, allowing a typical sequence of multiply-accumulate μops
+// to issue one every 1 cycle (sched advance = 2).
+def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
+
+def : SchedAlias<WriteMAC16, A57WriteMLA>;
+def : SchedAlias<WriteMAC32, A57WriteMLA>;
+def : SchedAlias<ReadMAC, A57ReadMLA>;
+
+def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
+def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
+
+// Multiply long: SMULL, UMULL
+def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
+def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
+
+// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
+// Parallel arith
+// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
+// Conditional GE-setting instructions require three extra μops
+// and two additional cycles to conditionally update the GE field.
+def A57WriteParArith : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1I_1M]>
+]>;
+def : InstRW< [A57WriteParArith], (instregex
+ "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
+ "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
+
+// Parallel arith with exchange: SASX, SSAX, UASX, USAX
+def A57WriteParArithExch : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1I_1M]>
+]>;
+def : InstRW<[A57WriteParArithExch],
+ (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
+
+// Parallel halving arith
+// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex
+ "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
+ "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
+
+// Parallel halving arith with exchange
+// SHASX, SHSAX, UHASX, UHSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
+ "(t2)?UHASX", "(t2)?UHSAX")>;
+
+// Parallel saturating arith
+// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
+ "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
+
+// Parallel saturating arith with exchange
+// QASX, QSAX, UQASX, UQSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
+ "(t2)?UQASX", "(t2)?UQSAX")>;
+
+// Saturate: SSAT, SSAT16, USAT, USAT16
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
+
+// Saturating arith: QADD, QSUB
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
+
+// Saturating doubling arith: QDADD, QDSUB
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
+
+// --- 3.7 Miscellaneous Data-Processing Instructions ---
+// Bit field extract: SBFX, UBFX
+def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
+
+// Bit field insert/clear: BFI, BFC
+def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
+
+// Select bytes, conditional/unconditional
+def A57WriteSEL : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
+
+// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
+def : InstRW<[A57Write_1cyc_1I],
+ (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
+
+// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
+
+// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
+def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
+
+// Sum of absolute differences: USAD8, USADA8
+def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
+
+// --- 3.8 Load Instructions ---
+
+// Load, immed offset
+// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
+def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
+ "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
+ "PICLDR", "tLDR")>;
+
+def : InstRW<[A57Write_4cyc_1L],
+ (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
+
+// For "Load, register offset, minus" we need +1cyc, +1I
+def A57WriteLdrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
+def A57WriteLdrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
+def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
+
+def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
+
+def A57WrBackOne : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def A57WrBackTwo : SchedWriteRes<[]> {
+ let Latency = 2;
+ let NumMicroOps = 0;
+}
+def A57WrBackThree : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+
+// --- LDR pre-indexed ---
+// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
+ "LDRB_PRE_IMM", "t2LDRB_PRE")>;
+
+// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
+// (5 cyc load result for not-lsl2 scaled)
+def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
+ (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
+
+def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
+ (instregex "LDR(H|SH|SB)_PRE")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_PRE")>;
+
+// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
+def A57WriteLdrDAm3Pre : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
+ (instregex "LDRD_PRE")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_PRE")>;
+
+// --- LDR post-indexed ---
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
+ "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
+
+def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
+ (instregex "LDR(H|SH|SB)_POST")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_POST")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
+ "LDRB_POST_REG", "LDR(B?)T_POST$")>;
+
+def A57WriteLdrTRegPost : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
+ SchedVar<NoSchedPred, [A57WrBackTwo]>
+]>;
+// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
+def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
+ (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
+
+def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_POST")>;
+
+// --- Preload instructions ---
+// Preload, immed offset
+def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
+ "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
+
+// Preload, register offset,
+// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
+// otherwise 4cyc "L"
+def A57WritePLD : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
+
+// --- Load multiple instructions ---
+foreach NumAddr = 1-8 in {
+ def A57LMAddrPred#NumAddr :
+ SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
+}
+
+def A57LDMOpsListNoregin : A57WriteLMOpsListType<
+ [A57Write_3cyc_1L, A57Write_3cyc_1L,
+ A57Write_4cyc_1L, A57Write_4cyc_1L,
+ A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L]>;
+def A57WriteLDMnoreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57LDMOpsListRegin : A57WriteLMOpsListType<
+ [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
+def A57WriteLDMreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57LDMOpsList_Upd : A57WriteLMOpsListType<
+ [A57WrBackOne,
+ A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
+ A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
+def A57WriteLDM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>,
+ SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
+]> { let Variadic=1; }
+
+def A57WriteLDM : SchedWriteVariant<[
+ SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
+ SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
+
+// TODO: no writeback latency defined in documentation (implemented as 1 cyc)
+def : InstRW<[A57WriteLDM_Upd],
+ (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
+
+// --- 3.9 Store Instructions ---
+
+// Store, immed offset
+def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
+ "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
+
+// Store, register offset
+// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
+// otherwise 1cyc S.
+def A57WriteStrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
+
+// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
+def A57WriteStrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
+def A57WriteStrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
+
+// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
+ "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
+ "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
+
+// Store, register pre-indexed:
+// 1(1) "S, I0/I1" for plus reg
+// 3(2) "I0/I1, S" for minus reg
+// 1(2) "S, M" for scaled plus lsl2
+// 3(2) "I0/I1, S" for other scaled
+def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoScaledPredX2, [A57Write_1cyc_1S_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledPredX2, [A57WrBackTwo]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
+ (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
+
+// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
+// 1(1) "S, I0/I1" for imm or reg plus
+// 3(2) "I0/I1, S" for reg minus
+def A57WriteStrAm3PreX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
+ (instregex "STRH_PRE")>;
+
+def A57WriteStrAm3PreX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
+ (instregex "STRD_PRE")>;
+
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
+ "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
+
+// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
+def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
+ "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
+
+// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
+// 1(1) "S, I0/I1" both for reg or imm
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
+
+// --- Store multiple instructions ---
+// TODO: no writeback latency defined in documentation
+def A57WriteSTM : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteSTM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
+def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
+ (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
+
+// --- 3.10 FP Data Processing Instructions ---
+def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
+
+// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
+def A57WriteVcmp : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1X]>
+]>;
+def : InstRW<[A57WriteVcmp],
+ (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
+
+// fp convert
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
+
+def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
+
+// FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
+
+// FP divide, FP square root
+def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
+def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
+
+// FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
+
+// FP multiply-accumulate pipelines support late forwarding of the result
+// from FP multiply μops to the accumulate operands of an
+// FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
+// after the FP multiply μop has been issued
+// FP multiply, FZ
+def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+
+def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
+def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
+def : ReadAdvance<ReadFPMUL, 0>;
+
+// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
+// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
+def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+
+// VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
+// VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
+// Currently, there is no way to define different read advances for VFMA operand
+// from VFMA or from VMUL, so there will be 5 read advance.
+// Zero latency (instead of one) for VMUL->VFMA shouldn't break something.
+// The same situation with ASIMD VMUL/VFMA instructions
+// def A57ReadVFMA : SchedRead;
+// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
+// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
+def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
+
+def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
+def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
+def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
+
+// --- 3.11 FP Miscellaneous Instructions ---
+// VMOV: 3cyc "F0/F1" for imm/reg
+def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+
+// 5cyc L for FP transfer, vfp to core reg,
+// 5cyc L for FP transfer, core reg to vfp
+def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
+// VMOVRRS/VMOVRRD in common code declared with one WriteFPMOV (instead of 2).
+def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
+
+// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
+def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>;
+
+// --- 3.12 FP Load Instructions ---
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
+
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
+
+// FP load multiple (VLDM)
+
+def A57VLDMOpsListUncond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_11cyc_1L,
+ A57Write_12cyc_1L, A57Write_12cyc_1L]>;
+def A57WriteVLDMuncond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_12cyc_1L,
+ A57Write_13cyc_1L, A57Write_14cyc_1L,
+ A57Write_15cyc_1L, A57Write_16cyc_1L,
+ A57Write_17cyc_1L, A57Write_18cyc_1L,
+ A57Write_19cyc_1L, A57Write_20cyc_1L]>;
+def A57WriteVLDMcond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
+
+def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
+ A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
+def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
+ A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
+ A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
+ A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
+ A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
+def A57WriteVLDMcond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM_UPD : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
+ (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
+
+// --- 3.13 FP Store Instructions ---
+def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
+
+def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
+
+def A57WriteVSTMs : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteVSTMd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1S]>
+]>;
+def A57WriteVSTMs_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+def A57WriteVSTMd_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
+def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
+ (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
+ (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
+
+// --- 3.14 ASIMD Integer Instructions ---
+
+// ASIMD absolute diff, 3cyc F0/F1 for integer VABD
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
+
+// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
+def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>;
+def : InstRW<[A57WriteVABAD, A57ReadVABAD],
+ (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
+def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>;
+def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
+ (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
+
+// ASIMD absolute diff accum long: 4(1) F1 for VABAL
+def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>;
+def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
+
+// ASIMD absolute diff long: 3cyc F0/F1 for VABDL
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
+
+// ASIMD arith, basic
+def : InstRW<[A57Write_3cyc_1V], (instregex "VADD", "VADDL", "VADDW",
+ "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
+ "VPADDi", "VPADDL", "VSUB", "VSUBL", "VSUBW")>;
+
+// ASIMD arith, complex
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
+ "VQABS", "VQADD", "VQNEG", "VQSUB",
+ "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
+
+// ASIMD compare
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
+
+// ASIMD logical
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
+
+// ASIMD max/min
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
+
+// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
+// and multiply-with-accumulate instructions relative to r0pX.
+def A57WriteVMULD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULD_VecInt], (instregex
+ "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
+ "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
+
+// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
+def A57WriteVMULQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def : InstRW<[A57WriteVMULQ_VecInt], (instregex
+ "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
+ "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate, D-form
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAD_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
+ (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
+
+// ASIMD multiply accumulate, Q-form
+// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def A57ReadVMLAQ_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
+ (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
+ (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
+
+// ASIMD multiply accumulate saturating long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (3 or 2 ReadAdvance)
+def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
+ (instregex "VQDMLAL", "VQDMLSL")>;
+
+// ASIMD multiply long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+def A57WriteVMULL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULL_VecInt],
+ (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
+
+// ASIMD pairwise add and accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>;
+def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
+
+// ASIMD shift accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>;
+def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[A57Write_3cyc_1X],
+ (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
+ "VRSHRN")>;
+
+// ASIMD shift by immed and insert, basic, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by immed and insert, basic, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, basic, D-form
+def : InstRW<[A57Write_3cyc_1X], (instregex
+ "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+// VQRSHL, VQSHL, VRSHL
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
+ "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
+ "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// --- 3.15 ASIMD Floating-Point Instructions ---
+// ASIMD FP absolute value
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
+
+// ASIMD FP arith
+def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
+ "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
+
+// ASIMD FP compare
+def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
+ "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
+
+// ASIMD FP convert, integer
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
+ "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
+ "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
+
+// ASIMD FP convert, half-precision: 8cyc F0/F1
+def : InstRW<[A57Write_8cyc_1V], (instregex
+ "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
+ "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
+ "VCVT(f2h|h2f)")>;
+
+// ASIMD FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
+
+// ASIMD FP multiply
+def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
+
+// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
+def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57ReadVMLA_VecFP :
+ SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
+def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
+ (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
+
+// ASIMD FP negate
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
+
+// ASIMD FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
+
+// --- 3.16 ASIMD Miscellaneous Instructions ---
+
+// ASIMD bitwise insert
+def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
+
+// ASIMD count
+def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
+
+// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
+
+// ASIMD duplicate, scalar: 3cyc "F0/F1"
+def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
+
+// ASIMD extract
+def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
+
+// ASIMD move, immed
+def : InstRW<[A57Write_3cyc_1V], (instregex
+ "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
+ "VMOVQ0")>;
+
+// ASIMD move, narrowing
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
+
+// ASIMD reciprocal estimate
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
+
+// ASIMD reciprocal step, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
+
+// ASIMD reverse, swap, table lookup (1-2 reg)
+def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
+
+// ASIMD table lookup (3-4 reg)
+def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
+
+// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
+def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
+
+// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
+
+// ASIMD transpose
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
+
+// ASIMD unzip/zip, D-form
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
+ (instregex "VUZPd", "VZIPd")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
+ (instregex "VUZPq", "VZIPq")>;
+
+// --- 3.17 ASIMD Load Instructions ---
+
+// Overriden via InstRW for this processor.
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
+// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+
+// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_6cyc_1L],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
+
+def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
+
+// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V],
+ (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+
+// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2b(8|16|32)wb")>;
+
+// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+ "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+// 2 results + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
+ (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
+ "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
+// 3 results
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)$")>;
+// 1 result
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+// 3 results + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3LN(d|q)32$",
+ "VLD3LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3LN(d|q)(8|16)$",
+ "VLD3LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3DUP(d|q)(8|16|32)$",
+ "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4LN(d|q)32$",
+ "VLD4LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4LN(d|q)(8|16)$",
+ "VLD4LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4DUP(d|q)(8|16|32)$",
+ "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.18 ASIMD Store Instructions ---
+
+// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
+def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
+def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
+ (instregex "VST1q(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
+def : InstRW<[A57Write_3cyc_1S],
+ (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
+def : InstRW<[A57Write_4cyc_1S],
+ (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2(d|b)(8|16|32)$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2(b|d)(8|16|32)wb")>;
+// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2LN(d|q)(8|16|32)_UPD",
+ "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 3 element, multiple, 3 reg
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3(d|q)(8|16|32)_UPD",
+ "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 3 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3LN(d|q)(8|16|32)_UPD",
+ "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 4 element, multiple, 4 reg
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST4(d|q)(8|16|32)_UPD",
+ "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 4 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST4LN(d|q)(8|16|32)_UPD",
+ "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.19 Cryptography Extensions ---
+// Crypto AES ops
+// AESD, AESE, AESIMC, AESMC: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
+// Crypto SHA1 xor ops: 6cyc F0/F1
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+// Crypto SHA1 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+// Crypto SHA1 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+// Crypto SHA256 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+// Crypto SHA256 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+
+// --- 3.20 CRC ---
+def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
+
+// -----------------------------------------------------------------------------
+// Common definitions
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
+
+def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
+
+def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
+def : SchedAlias<WriteST, A57Write_1cyc_1S>;
+def : ReadAdvance<ReadALU, 0>;
+
+} // SchedModel = CortexA57Model
+
Added: llvm/trunk/lib/Target/ARM/ARMScheduleA57WriteRes.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMScheduleA57WriteRes.td?rev=304530&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMScheduleA57WriteRes.td (added)
+++ llvm/trunk/lib/Target/ARM/ARMScheduleA57WriteRes.td Fri Jun 2 03:53:19 2017
@@ -0,0 +1,323 @@
+//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming conventions is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and
+// four to V pipes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+ let ResourceCycles = [17]; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+ let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
+def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; }
+def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; }
+def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+
+// A57Write_3cyc_1L - A57Write_20cyc_1L
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> {
+ let Latency = Lat;
+ }
+}
+
+// A57Write_4cyc_1S - A57Write_16cyc_1S
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> {
+ let Latency = Lat;
+ }
+}
+
+def A57Write_4cyc_1M : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; }
+def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 6; }
+def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 6; }
+def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 8; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 64;
+ let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS,
+ A57UnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 36;
+ let NumMicroOps = 2;
+ let ResourceCycles = [18, 18];
+}
+def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.h?rev=304530&r1=304529&r2=304530&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h Fri Jun 2 03:53:19 2017
@@ -234,6 +234,10 @@ protected:
/// CPSR setting instruction.
bool AvoidCPSRPartialUpdate = false;
+ /// CheapPredicableCPSRDef - If true, disable +1 predication cost
+ /// for instructions updating CPSR. Enabled for Cortex-A57.
+ bool CheapPredicableCPSRDef = false;
+
/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
bool AvoidMOVsShifterOperand = false;
@@ -543,6 +547,7 @@ public:
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+ bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasMPExtension() const { return HasMPExtension; }
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-alu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-alu.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-alu.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-alu.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; Check the latency for ALU shifted operand variants.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: foo:BB#0 entry
+
+; ALU, basic - 1 cyc I0/I1
+; CHECK: EORrr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 1
+
+; ALU, shift by immed - 2 cyc M
+; CHECK: ADDrsi
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; ALU, shift by register, unconditional - 2 cyc M
+; CHECK: RSBrsr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; ALU, shift by register, conditional - 2 cyc I0/I1
+; CHECK: ANDrsr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; Checking scheduling units
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; Skipping COPY
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: ANDrsr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: CMPri
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: RSBrsr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitM
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: ADDrsi
+; CHECK: Ready
+; CHECK-NEXT: A57UnitM
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: EORrr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8r-arm-none-eabi"
+
+; Function Attrs: norecurse nounwind readnone
+define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
+entry:
+ %xor = xor i32 %a, %b
+ %xor_shl = shl i32 %xor, 2
+ %add = add i32 %xor_shl, %d
+ %add_ashr = ashr i32 %add, %a
+ %sub = sub i32 %add_ashr, %a
+ %sub_lshr_pred = lshr i32 %sub, %c
+ %pred = icmp sgt i32 %a, 4
+ %and = and i32 %sub_lshr_pred, %b
+ %rv = select i1 %pred, i32 %and, i32 %d
+ ret i32 %rv
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-basic.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-basic.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-basic.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=A57_SCHED
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+
+; Check the latency for instructions for both generic and cortex-a57.
+; SDIV should be scheduled at the block's begin (20 cyc of independent M unit).
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: foo:BB#0 entry
+
+; GENERIC: SDIV
+; GENERIC: Latency : 1
+; GENERIC: EORrr
+; GENERIC: Latency : 1
+; GENERIC: LDRi12
+; GENERIC: Latency : 4
+; GENERIC: ADDrr
+; GENERIC: Latency : 1
+; GENERIC: SUBrr
+; GENERIC: Latency : 1
+
+; A57_SCHED: SDIV
+; A57_SCHED: Latency : 20
+; A57_SCHED: EORrr
+; A57_SCHED: Latency : 1
+; A57_SCHED: LDRi12
+; A57_SCHED: Latency : 4
+; A57_SCHED: ADDrr
+; A57_SCHED: Latency : 1
+; A57_SCHED: SUBrr
+; A57_SCHED: Latency : 1
+
+; CHECK: ** Final schedule for BB#0 ***
+; GENERIC: LDRi12
+; GENERIC: SDIV
+; A57_SCHED: SDIV
+; A57_SCHED: LDRi12
+; CHECK: ********** INTERVALS **********
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8r-arm-none-eabi"
+
+; Function Attrs: norecurse nounwind readnone
+define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr #0 {
+entry:
+ %xor = xor i32 %c, %b
+ %ld = load i32, i32* %d
+ %add = add nsw i32 %xor, %ld
+ %div = sdiv i32 %a, %b
+ %sub = sub i32 %div, %add
+ ret i32 %sub
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+;
+
+ at a = global i32 0, align 4
+ at b = global i32 0, align 4
+ at c = global i32 0, align 4
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have LDM instruction combined from single-loads
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDMIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 4
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=4
+define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
+ %1 = load i32, i32* @a, align 4
+ %2 = load i32, i32* @b, align 4
+ %3 = load i32, i32* @c, align 4
+
+ %ptr_after = getelementptr i32, i32* @a, i32 3
+
+ %ptr_val = ptrtoint i32* %ptr_after to i32
+ %mul1 = mul i32 %ptr_val, %1
+ %mul2 = mul i32 %mul1, %2
+ %mul3 = mul i32 %mul2, %3
+ ret i32 %mul3
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-ldm.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,28 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have LDM instruction combined from single-loads
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDMIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 3
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+
+define i32 @foo(i32* %a) nounwind optsize {
+entry:
+ %b = getelementptr i32, i32* %a, i32 1
+ %c = getelementptr i32, i32* %a, i32 2
+ %0 = load i32, i32* %a, align 4
+ %1 = load i32, i32* %b, align 4
+ %2 = load i32, i32* %c, align 4
+
+ %mul1 = mul i32 %0, %1
+ %mul2 = mul i32 %mul1, %2
+ ret i32 %mul2
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; N=3 STMIA_UPD should have latency 2cyc and writeback latency 1cyc
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have STM instruction combined from single-stores
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: STMIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+; CHECK: Successors
+; CHECK: data
+; CHECK-SAME: Latency=1
+
+define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) {
+
+ %addr.1 = getelementptr i32, i32* %addr, i32 0
+ store i32 %v0, i32* %addr.1
+
+ %addr.2 = getelementptr i32, i32* %addr, i32 1
+ store i32 %v1, i32* %addr.2
+
+ %addr.3 = getelementptr i32, i32* %addr, i32 2
+ store i32 %v2, i32* %addr.3
+
+ %ptr_after = getelementptr i32, i32* %addr, i32 3
+ %val = ptrtoint i32* %ptr_after to i32
+
+ %rv1 = mul i32 %val, %v0
+ %rv2 = mul i32 %rv1, %v1
+ %rv3 = mul i32 %rv2, %v2
+
+ ret i32 %rv3
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-stm.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; N=3 STMIB should have latency 2cyc
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have STM instruction combined from single-stores
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: STMIB
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+define i32 @test_stm(i32 %v0, i32 %v1, i32* %addr) {
+
+ %addr.1 = getelementptr i32, i32* %addr, i32 1
+ store i32 %v0, i32* %addr.1
+
+ %addr.2 = getelementptr i32, i32* %addr, i32 2
+ store i32 %v1, i32* %addr.2
+
+ %addr.3 = getelementptr i32, i32* %addr, i32 3
+ %val = ptrtoint i32* %addr to i32
+ store i32 %val, i32* %addr.3
+
+ %rv = add i32 %v0, %v1
+
+ ret i32 %rv
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; Check latencies of vmul/vfma accumulate chains.
+
+define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test1:BB#0
+
+; CHECK: VMULS
+; > VMULS common latency = 5
+; CHECK: Latency : 5
+; CHECK: Successors:
+; CHECK: data
+; > VMULS read-advanced latency to VMLAS = 0
+; CHECK-SAME: Latency=0
+
+; CHECK: VMLAS
+; > VMLAS common latency = 9
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS read-advanced latency to the next VMLAS = 4
+; CHECK-SAME: Latency=4
+
+; CHECK: VMLAS
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS not-optimized latency to VMOVRS = 9
+; CHECK-SAME: Latency=9
+
+; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
+ %mul1 = fmul float %f1, %f2
+ %mul2 = fmul float %f3, %f4
+ %mul3 = fmul float %f5, %f6
+ %add1 = fadd float %mul1, %mul2
+ %add2 = fadd float %add1, %mul3
+ ret float %add2
+}
+
+; ASIMD form
+define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test2:BB#0
+
+; CHECK: VMULfd
+; > VMULfd common latency = 5
+; CHECK: Latency : 5
+; CHECK: Successors:
+; CHECK: data
+; VMULfd read-advanced latency to VMLAfd = 0
+; CHECK-SAME: Latency=0
+
+; CHECK: VMLAfd
+; > VMLAfd common latency = 9
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAfd read-advanced latency to the next VMLAfd = 4
+; CHECK-SAME: Latency=4
+
+; CHECK: VMLAfd
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAfd not-optimized latency to VMOVRRD = 9
+; CHECK-SAME: Latency=9
+
+; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
+ %mul1 = fmul <2 x float> %f1, %f2
+ %mul2 = fmul <2 x float> %f3, %f4
+ %mul3 = fmul <2 x float> %f5, %f6
+ %add1 = fadd <2 x float> %mul1, %mul2
+ %add2 = fadd <2 x float> %add1, %mul3
+ ret <2 x float> %add2
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,50 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+;
+
+ at a = global double 0.0, align 4
+ at b = global double 0.0, align 4
+ at c = global double 0.0, align 4
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
+; CHECK: ********** MI Scheduling **********
+; CHECK: VLDMDIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 6
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=6
+define i32 @bar(i32* %iptr) minsize optsize {
+ %1 = load double, double* @a, align 8
+ %2 = load double, double* @b, align 8
+ %3 = load double, double* @c, align 8
+
+ %ptr_after = getelementptr double, double* @a, i32 3
+
+ %ptr_new_ival = ptrtoint double* %ptr_after to i32
+ %ptr_new = inttoptr i32 %ptr_new_ival to i32*
+
+ store i32 %ptr_new_ival, i32* %iptr, align 8
+
+ %v1 = fptoui double %1 to i32
+
+ %mul1 = mul i32 %ptr_new_ival, %v1
+
+ %v2 = fptoui double %2 to i32
+ %v3 = fptoui double %3 to i32
+
+ %mul2 = mul i32 %mul1, %v2
+ %mul3 = mul i32 %mul2, %v3
+
+ ret i32 %mul3
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vldm.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,30 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
+; CHECK: ********** MI Scheduling **********
+; CHECK: VLDMDIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 6
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=6
+
+define double @foo(double* %a) nounwind optsize {
+entry:
+ %b = getelementptr double, double* %a, i32 1
+ %c = getelementptr double, double* %a, i32 2
+ %0 = load double, double* %a, align 4
+ %1 = load double, double* %b, align 4
+ %2 = load double, double* %c, align 4
+
+ %mul1 = fmul double %0, %1
+ %mul2 = fmul double %mul1, %2
+ ret double %mul2
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: VSTMDIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 4
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+
+ at a = global double 0.0, align 4
+ at b = global double 0.0, align 4
+ at c = global double 0.0, align 4
+
+define i32 @bar(double* %vptr, i32 %iv1, i32* %iptr) minsize {
+
+ %vp2 = getelementptr double, double* %vptr, i32 1
+ %vp3 = getelementptr double, double* %vptr, i32 2
+
+ %v1 = load double, double* %vptr, align 8
+ %v2 = load double, double* %vp2, align 8
+ %v3 = load double, double* %vp3, align 8
+
+ store double %v1, double* @a, align 8
+ store double %v2, double* @b, align 8
+ store double %v3, double* @c, align 8
+
+ %ptr_after = getelementptr double, double* @a, i32 3
+
+ %ptr_new_ival = ptrtoint double* %ptr_after to i32
+ %ptr_new = inttoptr i32 %ptr_new_ival to i32*
+
+ store i32 %ptr_new_ival, i32* %iptr, align 8
+
+ %mul1 = mul i32 %ptr_new_ival, %iv1
+
+ ret i32 %mul1
+}
+
Added: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm.ll?rev=304530&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vstm.ll Fri Jun 2 03:53:19 2017
@@ -0,0 +1,23 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: VSTMDIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+%bigVec = type [2 x double]
+
+ at var = global %bigVec zeroinitializer
+
+define void @bar(%bigVec* %ptr) {
+
+ %tmp = load %bigVec, %bigVec* %ptr
+ store %bigVec %tmp, %bigVec* @var
+
+ ret void
+}
+
More information about the llvm-commits
mailing list