[llvm] 21f7c62 - [LLVM][ARM] Latency mutations for cortex m55, m7 and m85 (#115153)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 09:13:46 PST 2024
Author: Nashe Mncube
Date: 2024-11-13T17:13:41Z
New Revision: 21f7c626270d5b39c40f7d8f978ee91937d11dbb
URL: https://github.com/llvm/llvm-project/commit/21f7c626270d5b39c40f7d8f978ee91937d11dbb
DIFF: https://github.com/llvm/llvm-project/commit/21f7c626270d5b39c40f7d8f978ee91937d11dbb.diff
LOG: [LLVM][ARM] Latency mutations for cortex m55, m7 and m85 (#115153)
This patch adds latency mutations as a scheduling-related speedup for
the above-mentioned cores. When benchmarking this pass on selected
benchmarks, we see a performance improvement of 1% on most benchmarks,
with some improving by up to 6%.
Author: David Penry <david.penry at arm.com>
Co-authored-by: Nashe Mncube <nashe.mncube at arm.com>
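For readers unfamiliar with the hook this patch builds on: a ScheduleDAGMutation post-processes the scheduling DAG after it is built but before scheduling decisions are made. The skeleton below is a minimal sketch of that mechanism only; the class name and the zero-latency rule are illustrative, not taken from the patch (the real logic lives in the ARMOverrideBypasses classes in the diff):

#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"

namespace {
// Hypothetical mutation: zero out every output dependence, roughly what
// zeroOutputDependences() in this patch does for the M7/M85 overrides.
class ExampleLatencyMutation : public llvm::ScheduleDAGMutation {
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    for (llvm::SUnit &SU : DAG->SUnits)
      for (llvm::SDep &Dep : SU.Succs)
        if (Dep.getKind() == llvm::SDep::Output)
          Dep.setLatency(0); // the patch also mirrors this onto Preds
  }
};
} // namespace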
Added:
llvm/lib/Target/ARM/ARMLatencyMutations.cpp
llvm/lib/Target/ARM/ARMLatencyMutations.h
Modified:
llvm/lib/Target/ARM/ARMBaseInstrInfo.h
llvm/lib/Target/ARM/ARMProcessors.td
llvm/lib/Target/ARM/ARMSubtarget.h
llvm/lib/Target/ARM/ARMTargetMachine.cpp
llvm/lib/Target/ARM/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index aee9797585dbd2..b6f20e6f99a0a9 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -973,6 +973,34 @@ unsigned getBLXOpcode(const MachineFunction &MF);
unsigned gettBLXrOpcode(const MachineFunction &MF);
unsigned getBLXpredOpcode(const MachineFunction &MF);
+inline bool isMVEVectorInstruction(const MachineInstr *MI) {
+  // This attempts to filter out non-MVE instructions (scalar shifts), which
+  // are just DPU CX instructions.
+ switch (MI->getOpcode()) {
+ case ARM::MVE_SQSHL:
+ case ARM::MVE_SRSHR:
+ case ARM::MVE_UQSHL:
+ case ARM::MVE_URSHR:
+ case ARM::MVE_SQRSHR:
+ case ARM::MVE_UQRSHL:
+ case ARM::MVE_ASRLr:
+ case ARM::MVE_ASRLi:
+ case ARM::MVE_LSLLr:
+ case ARM::MVE_LSLLi:
+ case ARM::MVE_LSRL:
+ case ARM::MVE_SQRSHRL:
+ case ARM::MVE_SQSHLL:
+ case ARM::MVE_SRSHRL:
+ case ARM::MVE_UQRSHLL:
+ case ARM::MVE_UQSHLL:
+ case ARM::MVE_URSHRL:
+ return false;
+ }
+ const MCInstrDesc &MCID = MI->getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::DomainMask) == ARMII::DomainMVE;
+}
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
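The new inline helper classifies an instruction by its TSFlags scheduling domain after first excluding the scalar-shift opcodes that share the MVE domain encoding. A hypothetical caller (this function is illustrative only, not part of the patch) might use it like this:

#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

// Illustrative only: count the MVE vector instructions in a basic block.
static unsigned countMVEVectorInstrs(const llvm::MachineBasicBlock &MBB) {
  unsigned Count = 0;
  for (const llvm::MachineInstr &MI : MBB)
    if (llvm::isMVEVectorInstruction(&MI))
      ++Count;
  return Count;
}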
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.cpp b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
new file mode 100644
index 00000000000000..3c86e8ab5892e4
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
@@ -0,0 +1,978 @@
+//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file defines the ARM DAG scheduling mutations, which
+/// change inter-instruction latencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMLatencyMutations.h"
+#include "ARMSubtarget.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <algorithm>
+#include <array>
+#include <initializer_list>
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+namespace {
+
+// Precompute information about opcodes to speed up the pass.
+
+class InstructionInformation {
+protected:
+ struct IInfo {
+ bool HasBRegAddr : 1; // B-side of addr gen is a register
+ bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
+ bool IsDivide : 1; // Some form of integer divide
+ bool IsInlineShiftALU : 1; // Inline shift+ALU
+ bool IsMultiply : 1; // Some form of integer multiply
+ bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation
+ bool IsNonSubwordLoad : 1; // Load which is a word or larger
+ bool IsShift : 1; // Shift operation
+ bool IsRev : 1; // REV operation
+ bool ProducesQP : 1; // Produces a vector register result
+ bool ProducesDP : 1; // Produces a double-precision register result
+ bool ProducesSP : 1; // Produces a single-precision register result
+ bool ConsumesQP : 1; // Consumes a vector register result
+ bool ConsumesDP : 1; // Consumes a double-precision register result
+ bool ConsumesSP : 1; // Consumes a single-precision register result
+ unsigned MVEIntMACMatched; // Matched operand type (for MVE)
+ unsigned AddressOpMask; // Mask indicating which operands go into AGU
+ IInfo()
+ : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
+ IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
+ IsNonSubwordLoad(false), IsShift(false), IsRev(false),
+ ProducesQP(false), ProducesDP(false), ProducesSP(false),
+ ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
+ MVEIntMACMatched(0), AddressOpMask(0) {}
+ };
+ typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
+ IInfoArray Info;
+
+public:
+ // Always available information
+ unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
+ bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
+ bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
+ bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
+ bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
+ bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
+ bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
+ bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
+ bool isRev(unsigned Op) { return Info[Op].IsRev; }
+ bool isShift(unsigned Op) { return Info[Op].IsShift; }
+
+  // Information available only if markDPProducersConsumers is called.
+ bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
+ bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
+ bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
+ bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
+ bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
+ bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
+
+ bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
+ return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
+ }
+
+ InstructionInformation(const ARMBaseInstrInfo *TII);
+
+protected:
+ void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
+};
+
+InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
+ using namespace ARM;
+
+ std::initializer_list<unsigned> hasBRegAddrList = {
+ t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+ tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
+ };
+ for (auto op : hasBRegAddrList) {
+ Info[op].HasBRegAddr = true;
+ }
+
+ std::initializer_list<unsigned> hasBRegAddrShiftList = {
+ t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+ };
+ for (auto op : hasBRegAddrShiftList) {
+ Info[op].HasBRegAddrShift = true;
+ }
+
+ Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
+
+ std::initializer_list<unsigned> isInlineShiftALUList = {
+ t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,
+ t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,
+ t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,
+ };
+ for (auto op : isInlineShiftALUList) {
+ Info[op].IsInlineShiftALU = true;
+ }
+
+ std::initializer_list<unsigned> isMultiplyList = {
+ t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
+ t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
+ t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
+ t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
+ t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
+ t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
+ };
+ for (auto op : isMultiplyList) {
+ Info[op].IsMultiply = true;
+ }
+
+ std::initializer_list<unsigned> isMVEIntMACList = {
+ MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
+ MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
+ MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
+ MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
+ MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
+ MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
+ MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
+ MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
+ MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
+ MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
+ MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
+ MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
+ MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
+ MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
+ };
+ for (auto op : isMVEIntMACList) {
+ Info[op].IsMVEIntMAC = true;
+ }
+
+ std::initializer_list<unsigned> isNonSubwordLoadList = {
+ t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
+ t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
+ tLDRpci, tLDRr, tLDRspi,
+ };
+ for (auto op : isNonSubwordLoadList) {
+ Info[op].IsNonSubwordLoad = true;
+ }
+
+ std::initializer_list<unsigned> isRevList = {
+ t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
+ };
+ for (auto op : isRevList) {
+ Info[op].IsRev = true;
+ }
+
+ std::initializer_list<unsigned> isShiftList = {
+ t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
+ tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
+ };
+ for (auto op : isShiftList) {
+ Info[op].IsShift = true;
+ }
+
+ std::initializer_list<unsigned> Address1List = {
+ t2LDRBi12,
+ t2LDRBi8,
+ t2LDRBpci,
+ t2LDRBs,
+ t2LDRHi12,
+ t2LDRHi8,
+ t2LDRHpci,
+ t2LDRHs,
+ t2LDRSBi12,
+ t2LDRSBi8,
+ t2LDRSBpci,
+ t2LDRSBs,
+ t2LDRSHi12,
+ t2LDRSHi8,
+ t2LDRSHpci,
+ t2LDRSHs,
+ t2LDRi12,
+ t2LDRi8,
+ t2LDRpci,
+ t2LDRs,
+ tLDRBi,
+ tLDRBr,
+ tLDRHi,
+ tLDRHr,
+ tLDRSB,
+ tLDRSH,
+ tLDRi,
+ tLDRpci,
+ tLDRr,
+ tLDRspi,
+ t2STRBi12,
+ t2STRBi8,
+ t2STRBs,
+ t2STRHi12,
+ t2STRHi8,
+ t2STRHs,
+ t2STRi12,
+ t2STRi8,
+ t2STRs,
+ tSTRBi,
+ tSTRBr,
+ tSTRHi,
+ tSTRHr,
+ tSTRi,
+ tSTRr,
+ tSTRspi,
+ VLDRD,
+ VLDRH,
+ VLDRS,
+ VSTRD,
+ VSTRH,
+ VSTRS,
+ MVE_VLD20_16,
+ MVE_VLD20_32,
+ MVE_VLD20_8,
+ MVE_VLD21_16,
+ MVE_VLD21_32,
+ MVE_VLD21_8,
+ MVE_VLD40_16,
+ MVE_VLD40_32,
+ MVE_VLD40_8,
+ MVE_VLD41_16,
+ MVE_VLD41_32,
+ MVE_VLD41_8,
+ MVE_VLD42_16,
+ MVE_VLD42_32,
+ MVE_VLD42_8,
+ MVE_VLD43_16,
+ MVE_VLD43_32,
+ MVE_VLD43_8,
+ MVE_VLDRBS16,
+ MVE_VLDRBS16_rq,
+ MVE_VLDRBS32,
+ MVE_VLDRBS32_rq,
+ MVE_VLDRBU16,
+ MVE_VLDRBU16_rq,
+ MVE_VLDRBU32,
+ MVE_VLDRBU32_rq,
+ MVE_VLDRBU8,
+ MVE_VLDRBU8_rq,
+ MVE_VLDRDU64_qi,
+ MVE_VLDRDU64_rq,
+ MVE_VLDRDU64_rq_u,
+ MVE_VLDRHS32,
+ MVE_VLDRHS32_rq,
+ MVE_VLDRHS32_rq_u,
+ MVE_VLDRHU16,
+ MVE_VLDRHU16_rq,
+ MVE_VLDRHU16_rq_u,
+ MVE_VLDRHU32,
+ MVE_VLDRHU32_rq,
+ MVE_VLDRHU32_rq_u,
+ MVE_VLDRWU32,
+ MVE_VLDRWU32_qi,
+ MVE_VLDRWU32_rq,
+ MVE_VLDRWU32_rq_u,
+ MVE_VST20_16,
+ MVE_VST20_32,
+ MVE_VST20_8,
+ MVE_VST21_16,
+ MVE_VST21_32,
+ MVE_VST21_8,
+ MVE_VST40_16,
+ MVE_VST40_32,
+ MVE_VST40_8,
+ MVE_VST41_16,
+ MVE_VST41_32,
+ MVE_VST41_8,
+ MVE_VST42_16,
+ MVE_VST42_32,
+ MVE_VST42_8,
+ MVE_VST43_16,
+ MVE_VST43_32,
+ MVE_VST43_8,
+ MVE_VSTRB16,
+ MVE_VSTRB16_rq,
+ MVE_VSTRB32,
+ MVE_VSTRB32_rq,
+ MVE_VSTRBU8,
+ MVE_VSTRB8_rq,
+ MVE_VSTRD64_qi,
+ MVE_VSTRD64_rq,
+ MVE_VSTRD64_rq_u,
+ MVE_VSTRH32,
+ MVE_VSTRH32_rq,
+ MVE_VSTRH32_rq_u,
+ MVE_VSTRHU16,
+ MVE_VSTRH16_rq,
+ MVE_VSTRH16_rq_u,
+ MVE_VSTRWU32,
+ MVE_VSTRW32_qi,
+ MVE_VSTRW32_rq,
+ MVE_VSTRW32_rq_u,
+ };
+ std::initializer_list<unsigned> Address2List = {
+ t2LDRB_POST,
+ t2LDRB_PRE,
+ t2LDRDi8,
+ t2LDRH_POST,
+ t2LDRH_PRE,
+ t2LDRSB_POST,
+ t2LDRSB_PRE,
+ t2LDRSH_POST,
+ t2LDRSH_PRE,
+ t2LDR_POST,
+ t2LDR_PRE,
+ t2STRB_POST,
+ t2STRB_PRE,
+ t2STRDi8,
+ t2STRH_POST,
+ t2STRH_PRE,
+ t2STR_POST,
+ t2STR_PRE,
+ MVE_VLD20_16_wb,
+ MVE_VLD20_32_wb,
+ MVE_VLD20_8_wb,
+ MVE_VLD21_16_wb,
+ MVE_VLD21_32_wb,
+ MVE_VLD21_8_wb,
+ MVE_VLD40_16_wb,
+ MVE_VLD40_32_wb,
+ MVE_VLD40_8_wb,
+ MVE_VLD41_16_wb,
+ MVE_VLD41_32_wb,
+ MVE_VLD41_8_wb,
+ MVE_VLD42_16_wb,
+ MVE_VLD42_32_wb,
+ MVE_VLD42_8_wb,
+ MVE_VLD43_16_wb,
+ MVE_VLD43_32_wb,
+ MVE_VLD43_8_wb,
+ MVE_VLDRBS16_post,
+ MVE_VLDRBS16_pre,
+ MVE_VLDRBS32_post,
+ MVE_VLDRBS32_pre,
+ MVE_VLDRBU16_post,
+ MVE_VLDRBU16_pre,
+ MVE_VLDRBU32_post,
+ MVE_VLDRBU32_pre,
+ MVE_VLDRBU8_post,
+ MVE_VLDRBU8_pre,
+ MVE_VLDRDU64_qi_pre,
+ MVE_VLDRHS32_post,
+ MVE_VLDRHS32_pre,
+ MVE_VLDRHU16_post,
+ MVE_VLDRHU16_pre,
+ MVE_VLDRHU32_post,
+ MVE_VLDRHU32_pre,
+ MVE_VLDRWU32_post,
+ MVE_VLDRWU32_pre,
+ MVE_VLDRWU32_qi_pre,
+ MVE_VST20_16_wb,
+ MVE_VST20_32_wb,
+ MVE_VST20_8_wb,
+ MVE_VST21_16_wb,
+ MVE_VST21_32_wb,
+ MVE_VST21_8_wb,
+ MVE_VST40_16_wb,
+ MVE_VST40_32_wb,
+ MVE_VST40_8_wb,
+ MVE_VST41_16_wb,
+ MVE_VST41_32_wb,
+ MVE_VST41_8_wb,
+ MVE_VST42_16_wb,
+ MVE_VST42_32_wb,
+ MVE_VST42_8_wb,
+ MVE_VST43_16_wb,
+ MVE_VST43_32_wb,
+ MVE_VST43_8_wb,
+ MVE_VSTRB16_post,
+ MVE_VSTRB16_pre,
+ MVE_VSTRB32_post,
+ MVE_VSTRB32_pre,
+ MVE_VSTRBU8_post,
+ MVE_VSTRBU8_pre,
+ MVE_VSTRD64_qi_pre,
+ MVE_VSTRH32_post,
+ MVE_VSTRH32_pre,
+ MVE_VSTRHU16_post,
+ MVE_VSTRHU16_pre,
+ MVE_VSTRWU32_post,
+ MVE_VSTRWU32_pre,
+ MVE_VSTRW32_qi_pre,
+ };
+ std::initializer_list<unsigned> Address3List = {
+ t2LDRD_POST,
+ t2LDRD_PRE,
+ t2STRD_POST,
+ t2STRD_PRE,
+ };
+ // Compute a mask of which operands are involved in address computation
+ for (auto &op : Address1List) {
+ Info[op].AddressOpMask = 0x6;
+ }
+ for (auto &op : Address2List) {
+ Info[op].AddressOpMask = 0xc;
+ }
+ for (auto &op : Address3List) {
+ Info[op].AddressOpMask = 0x18;
+ }
+ for (auto &op : hasBRegAddrShiftList) {
+ Info[op].AddressOpMask |= 0x8;
+ }
+}
+
+void InstructionInformation::markDPProducersConsumers(
+ const ARMBaseInstrInfo *TII) {
+ // Learn about all instructions which have FP source/dest registers
+ for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
+ const MCInstrDesc &MID = TII->get(MI);
+ auto Operands = MID.operands();
+ for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
+ bool MarkQP = false, MarkDP = false, MarkSP = false;
+ switch (Operands[OI].RegClass) {
+ case ARM::MQPRRegClassID:
+ case ARM::DPRRegClassID:
+ case ARM::DPR_8RegClassID:
+ case ARM::DPR_VFP2RegClassID:
+ case ARM::DPairRegClassID:
+ case ARM::DPairSpcRegClassID:
+ case ARM::DQuadRegClassID:
+ case ARM::DQuadSpcRegClassID:
+ case ARM::DTripleRegClassID:
+ case ARM::DTripleSpcRegClassID:
+ MarkDP = true;
+ break;
+ case ARM::QPRRegClassID:
+ case ARM::QPR_8RegClassID:
+ case ARM::QPR_VFP2RegClassID:
+ case ARM::QQPRRegClassID:
+ case ARM::QQQQPRRegClassID:
+ MarkQP = true;
+ break;
+ case ARM::SPRRegClassID:
+ case ARM::SPR_8RegClassID:
+ case ARM::FPWithVPRRegClassID:
+ MarkSP = true;
+ break;
+ default:
+ break;
+ }
+ if (MarkQP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesQP = true;
+ else
+ Info[MI].ConsumesQP = true;
+ }
+ if (MarkDP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesDP = true;
+ else
+ Info[MI].ConsumesDP = true;
+ }
+ if (MarkSP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesSP = true;
+ else
+ Info[MI].ConsumesSP = true;
+ }
+ }
+ }
+}
+
+} // anonymous namespace
+
+static bool hasImplicitCPSRUse(const MachineInstr *MI) {
+ return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
+}
+
+void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
+ unsigned latency) {
+ SDep Reverse = SrcDep;
+ Reverse.setSUnit(&SrcSU);
+ for (SDep &PDep : SrcDep.getSUnit()->Preds) {
+ if (PDep == Reverse) {
+ PDep.setLatency(latency);
+ SrcDep.getSUnit()->setDepthDirty();
+ break;
+ }
+ }
+ SrcDep.setLatency(latency);
+ SrcSU.setHeightDirty();
+}
+
+static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
+ return (a & 0xe) != (b & 0xe);
+}
+
+// Set output dependences to zero latency for processors which can
+// simultaneously issue to the same register. Returns true if a change
+// was made.
+bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
+ if (Dep.getKind() == SDep::Output) {
+ setBidirLatencies(ISU, Dep, 0);
+ return true;
+ }
+ return false;
+}
+
+// The graph doesn't look inside of bundles to determine their
+// scheduling boundaries and reports zero latency into and out of them
+// (except for CPSR into the bundle, which has latency 1).
+// Make some better scheduling assumptions:
+// 1) CPSR uses have zero latency; other uses have incoming latency 1
+// 2) CPSR defs retain a latency of zero; others have a latency of 1.
+//
+// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
+unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
+
+ SUnit &DepSU = *Dep.getSUnit();
+ const MachineInstr *SrcMI = ISU.getInstr();
+ unsigned SrcOpcode = SrcMI->getOpcode();
+ const MachineInstr *DstMI = DepSU.getInstr();
+ unsigned DstOpcode = DstMI->getOpcode();
+
+ if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
+ setBidirLatencies(
+ ISU, Dep,
+ (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
+ return 1;
+ }
+ if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
+ Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
+ setBidirLatencies(ISU, Dep, 1);
+ return 2;
+ }
+ return 0;
+}
+
+// Determine whether there is a memory RAW hazard here and set up latency
+// accordingly
+bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
+ unsigned latency) {
+ if (!Dep.isNormalMemory())
+ return false;
+ auto &SrcInst = *ISU.getInstr();
+ auto &DstInst = *Dep.getSUnit()->getInstr();
+ if (!SrcInst.mayStore() || !DstInst.mayLoad())
+ return false;
+
+ auto SrcMO = *SrcInst.memoperands().begin();
+ auto DstMO = *DstInst.memoperands().begin();
+ auto SrcVal = SrcMO->getValue();
+ auto DstVal = DstMO->getValue();
+ auto SrcPseudoVal = SrcMO->getPseudoValue();
+ auto DstPseudoVal = DstMO->getPseudoValue();
+ if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
+ SrcMO->getOffset() == DstMO->getOffset()) {
+ setBidirLatencies(ISU, Dep, latency);
+ return true;
+ } else if (SrcPseudoVal && DstPseudoVal &&
+ SrcPseudoVal->kind() == DstPseudoVal->kind() &&
+ SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
+ // Spills/fills
+ auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
+ auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
+ if (FS0 == FS1) {
+ setBidirLatencies(ISU, Dep, latency);
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+
+std::unique_ptr<InstructionInformation> II;
+
+class CortexM7InstructionInformation : public InstructionInformation {
+public:
+ CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
+ : InstructionInformation(TII) {}
+};
+
+class CortexM7Overrides : public ARMOverrideBypasses {
+public:
+ CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
+ : ARMOverrideBypasses(TII, AA) {
+ if (!II)
+ II.reset(new CortexM7InstructionInformation(TII));
+ }
+
+ void modifyBypasses(SUnit &) override;
+};
+
+void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
+ const MachineInstr *SrcMI = ISU.getInstr();
+ unsigned SrcOpcode = SrcMI->getOpcode();
+ bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
+
+ // Walk the successors looking for latency overrides that are needed
+ for (SDep &Dep : ISU.Succs) {
+
+ // Output dependences should have 0 latency, as M7 is able to
+ // schedule writers to the same register for simultaneous issue.
+ if (zeroOutputDependences(ISU, Dep))
+ continue;
+
+ if (memoryRAWHazard(ISU, Dep, 4))
+ continue;
+
+ // Ignore dependencies other than data
+ if (Dep.getKind() != SDep::Data)
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+
+ if (makeBundleAssumptions(ISU, Dep) == 1)
+ continue;
+
+ const MachineInstr *DstMI = DepSU.getInstr();
+ unsigned DstOpcode = DstMI->getOpcode();
+
+    // Word loads into any multiply or divide instruction are considered
+    // unable to bypass their scheduling stage. Didn't do this in the .td file
+ // because we cannot easily create a read advance that is 0 from certain
+ // writer classes and 1 from all the rest.
+ // (The other way around would have been easy.)
+ if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
+ setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+    // Word loads into the B operand of a load/store are considered unable to
+    // bypass their scheduling stage. Cannot be done in the .td file because we
+    // need to decide between -1 and -2 for ReadAdvance.
+ if (isNSWload && II->hasBRegAddr(DstOpcode) &&
+ DstMI->getOperand(2).getReg() == Dep.getReg())
+ setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+    // Multiplies into any address generation cannot bypass from EX3. Cannot be
+    // done in the .td file because we need to decide between -1 and -2 for
+    // ReadAdvance.
+ if (II->isMultiply(SrcOpcode)) {
+ unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
+ for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
+ if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
+ DstMI->getOperand(i).getReg() == Dep.getReg()) {
+ setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
+ break;
+ }
+ }
+ }
+
+ // Mismatched conditional producers take longer on M7; they end up looking
+ // like they were produced at EX3 and read at IS.
+ if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
+ (SrcOpcode == ARM::BUNDLE ||
+ mismatchedPred(TII->getPredicate(*SrcMI),
+ TII->getPredicate(*DstMI)))) {
+ unsigned Lat = 1;
+ // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
+ if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
+ DstMI->getOperand(1).getReg() == Dep.getReg())
+ Lat = 2;
+ Lat = std::min(3u, Dep.getLatency() + Lat);
+ setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
+ }
+
+ // CC setter into conditional producer shouldn't have a latency of more
+ // than 1 unless it's due to an implicit read. (All the "true" readers
+    // of the condition code use an implicit read, and predicates use an
+    // explicit one.)
+ if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
+ TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
+ setBidirLatencies(ISU, Dep, 1);
+
+ // REV instructions cannot bypass directly into the EX1 shifter. The
+ // code is slightly inexact as it doesn't attempt to ensure that the bypass
+ // is to the shifter operands.
+ if (II->isRev(SrcOpcode)) {
+ if (II->isInlineShiftALU(DstOpcode))
+ setBidirLatencies(ISU, Dep, 2);
+ else if (II->isShift(DstOpcode))
+ setBidirLatencies(ISU, Dep, 1);
+ }
+ }
+}
+
+class M85InstructionInformation : public InstructionInformation {
+public:
+ M85InstructionInformation(const ARMBaseInstrInfo *t)
+ : InstructionInformation(t) {
+ markDPProducersConsumers(t);
+ }
+};
+
+class M85Overrides : public ARMOverrideBypasses {
+public:
+ M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
+ : ARMOverrideBypasses(t, a) {
+ if (!II)
+ II.reset(new M85InstructionInformation(t));
+ }
+
+ void modifyBypasses(SUnit &) override;
+
+private:
+ unsigned computeBypassStage(const MCSchedClassDesc *SCD);
+ signed modifyMixedWidthFP(const MachineInstr *SrcMI,
+ const MachineInstr *DstMI, unsigned RegID,
+ const MCSchedClassDesc *SCD);
+};
+
+unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
+ auto SM = DAG->getSchedModel();
+ unsigned DefIdx = 0; // just look for the first output's timing
+ if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
+ unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
+ if (Latency == 4)
+ return 2;
+ else if (Latency == 5)
+ return 3;
+ else if (Latency > 3)
+ return 3;
+ else
+ return Latency;
+ }
+ return 2;
+}
+
+// Latency changes for bypassing between FP registers of different sizes:
+//
+// Note that mixed DP/SP are unlikely because of the semantics
+// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
+signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
+ const MachineInstr *DstMI,
+ unsigned RegID,
+ const MCSchedClassDesc *SCD) {
+
+ if (!II->producesSP(SrcMI->getOpcode()) &&
+ !II->producesDP(SrcMI->getOpcode()) &&
+ !II->producesQP(SrcMI->getOpcode()))
+ return 0;
+
+ if (Register::isVirtualRegister(RegID)) {
+ if (II->producesSP(SrcMI->getOpcode()) &&
+ II->consumesDP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+ OP.getSubReg() == ARM::ssub_1)
+ return 5 - computeBypassStage(SCD);
+ } else if (II->producesSP(SrcMI->getOpcode()) &&
+ II->consumesQP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+ (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
+ return 5 - computeBypassStage(SCD) -
+ ((OP.getSubReg() == ARM::ssub_2 ||
+ OP.getSubReg() == ARM::ssub_3)
+ ? 1
+ : 0);
+ } else if (II->producesDP(SrcMI->getOpcode()) &&
+ II->consumesQP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
+ OP.getSubReg() == ARM::ssub_1)
+ return -1;
+ } else if (II->producesDP(SrcMI->getOpcode()) &&
+ II->consumesSP(DstMI->getOpcode())) {
+ for (auto &OP : DstMI->operands())
+ if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+ OP.getSubReg() == ARM::ssub_1)
+ return 5 - computeBypassStage(SCD);
+ } else if (II->producesQP(SrcMI->getOpcode()) &&
+ II->consumesSP(DstMI->getOpcode())) {
+ for (auto &OP : DstMI->operands())
+ if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+ (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
+ return 5 - computeBypassStage(SCD) +
+ ((OP.getSubReg() == ARM::ssub_2 ||
+ OP.getSubReg() == ARM::ssub_3)
+ ? 1
+ : 0);
+ } else if (II->producesQP(SrcMI->getOpcode()) &&
+ II->consumesDP(DstMI->getOpcode())) {
+ for (auto &OP : DstMI->operands())
+ if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
+ OP.getSubReg() == ARM::ssub_1)
+ return 1;
+ }
+ } else if (Register::isPhysicalRegister(RegID)) {
+ // Note that when the producer is narrower, not all of the producers
+ // may be present in the scheduling graph; somewhere earlier in the
+ // compiler, an implicit def/use of the aliased full register gets
+ // added to the producer, and so only that producer is seen as *the*
+ // single producer. This behavior also has the unfortunate effect of
+ // serializing the producers in the compiler's view of things.
+ if (II->producesSP(SrcMI->getOpcode()) &&
+ II->consumesDP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
+ OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
+ (OP.getReg() == RegID ||
+ (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
+ (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
+ return 5 - computeBypassStage(SCD);
+ } else if (II->producesSP(SrcMI->getOpcode()) &&
+ II->consumesQP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
+ OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
+ (OP.getReg() == RegID ||
+ (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
+ (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
+ return 5 - computeBypassStage(SCD) -
+ (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
+ } else if (II->producesDP(SrcMI->getOpcode()) &&
+ II->consumesQP(DstMI->getOpcode())) {
+ for (auto &OP : SrcMI->operands())
+ if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
+ OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
+ (OP.getReg() == RegID ||
+ (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
+ return -1;
+ } else if (II->producesDP(SrcMI->getOpcode()) &&
+ II->consumesSP(DstMI->getOpcode())) {
+ if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
+ return 5 - computeBypassStage(SCD);
+ } else if (II->producesQP(SrcMI->getOpcode()) &&
+ II->consumesSP(DstMI->getOpcode())) {
+ if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
+ return 5 - computeBypassStage(SCD) +
+ (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
+ } else if (II->producesQP(SrcMI->getOpcode()) &&
+ II->consumesDP(DstMI->getOpcode())) {
+ if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void M85Overrides::modifyBypasses(SUnit &ISU) {
+ const MachineInstr *SrcMI = ISU.getInstr();
+ unsigned SrcOpcode = SrcMI->getOpcode();
+ bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
+
+ // Walk the successors looking for latency overrides that are needed
+ for (SDep &Dep : ISU.Succs) {
+
+ // Output dependences should have 0 latency, as CortexM85 is able to
+ // schedule writers to the same register for simultaneous issue.
+ if (zeroOutputDependences(ISU, Dep))
+ continue;
+
+ if (memoryRAWHazard(ISU, Dep, 3))
+ continue;
+
+    // Ignore dependencies other than data.
+ if (Dep.getKind() != SDep::Data)
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+
+ if (makeBundleAssumptions(ISU, Dep) == 1)
+ continue;
+
+ const MachineInstr *DstMI = DepSU.getInstr();
+ unsigned DstOpcode = DstMI->getOpcode();
+
+    // Word loads into the B operand of a load/store with a shift cannot
+    // bypass their scheduling stage. Cannot be done in the .td file because we
+    // need to decide between -1 and -2 for ReadAdvance.
+
+ if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
+ DstMI->getOperand(3).getImm() != 0 && // shift operand
+ DstMI->getOperand(2).getReg() == Dep.getReg())
+ setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+
+ if (isNSWload && isMVEVectorInstruction(DstMI)) {
+ setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
+ }
+
+ if (II->isMVEIntMAC(DstOpcode) &&
+ II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
+ DstMI->getOperand(0).isReg() &&
+ DstMI->getOperand(0).getReg() == Dep.getReg())
+ setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);
+
+ // CC setter into conditional producer shouldn't have a latency of more
+ // than 0 unless it's due to an implicit read.
+ if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
+ TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
+ setBidirLatencies(ISU, Dep, 0);
+
+ if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
+ DAG->getSchedClass(&ISU)))
+ setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));
+
+ if (II->isRev(SrcOpcode)) {
+ if (II->isInlineShiftALU(DstOpcode))
+ setBidirLatencies(ISU, Dep, 1);
+ else if (II->isShift(DstOpcode))
+ setBidirLatencies(ISU, Dep, 1);
+ }
+ }
+}
+
+// Add M55 specific overrides for latencies between instructions. Currently it:
+// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
+class CortexM55Overrides : public ARMOverrideBypasses {
+public:
+ CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
+ : ARMOverrideBypasses(TII, AA) {}
+
+ void modifyBypasses(SUnit &SU) override {
+ MachineInstr *SrcMI = SU.getInstr();
+ if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
+ return;
+
+ for (SDep &Dep : SU.Succs) {
+ if (Dep.getKind() != SDep::Data)
+ continue;
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+ MachineInstr *DstMI = DepSU.getInstr();
+
+ if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
+ setBidirLatencies(SU, Dep, 3);
+ }
+ }
+};
+
+} // end anonymous namespace
+
+void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
+ DAG = DAGInstrs;
+ for (SUnit &ISU : DAGInstrs->SUnits) {
+ if (ISU.isBoundaryNode())
+ continue;
+ modifyBypasses(ISU);
+ }
+ if (DAGInstrs->ExitSU.getInstr())
+ modifyBypasses(DAGInstrs->ExitSU);
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
+ if (ST.isCortexM85())
+ return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
+ else if (ST.isCortexM7())
+ return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
+ else if (ST.isCortexM55())
+ return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
+
+ return nullptr;
+}
+
+} // end namespace llvm
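A note on the AddressOpMask convention used above: each set bit marks an operand index that feeds the address-generation unit, which is why Address1List instructions get 0x6 (operands 1 and 2) and the pre/post-indexed Address2List forms get 0xc (operands 2 and 3). The sketch below decodes such a mask the same way the CortexM7Overrides shift loop does (a standalone illustration, not code from the patch):

#include "llvm/ADT/SmallVector.h"

// Illustrative: return the operand indices selected by an AddressOpMask,
// e.g. 0x6 -> {1, 2} and 0x18 -> {3, 4}.
static llvm::SmallVector<unsigned, 2> aguOperandIndices(unsigned Mask) {
  llvm::SmallVector<unsigned, 2> Indices;
  for (unsigned I = 0; Mask; ++I, Mask >>= 1)
    if (Mask & 1)
      Indices.push_back(I);
  return Indices;
}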
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.h b/llvm/lib/Target/ARM/ARMLatencyMutations.h
new file mode 100644
index 00000000000000..a4b8de0be51f72
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.h
@@ -0,0 +1,56 @@
+//===- ARMLatencyMutations.h - ARM Latency Mutations ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file declares the ARM DAG scheduling mutations, which
+/// change inter-instruction latencies.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_LATENCYMUTATIONS_H
+#define LLVM_LIB_TARGET_ARM_LATENCYMUTATIONS_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+
+namespace llvm {
+
+class AAResults;
+class ARMBaseInstrInfo;
+
+/// Post-process the DAG to override the latencies on dependence edges
+/// between instructions for a specific subtarget.
+class ARMOverrideBypasses : public ScheduleDAGMutation {
+public:
+ ARMOverrideBypasses(const ARMBaseInstrInfo *t, AAResults *a)
+ : ScheduleDAGMutation(), TII(t), AA(a) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+private:
+ virtual void modifyBypasses(SUnit &) = 0;
+
+protected:
+ const ARMBaseInstrInfo *TII;
+ AAResults *AA;
+ ScheduleDAGInstrs *DAG = nullptr;
+
+ static void setBidirLatencies(SUnit &SrcSU, SDep &SrcDep, unsigned latency);
+ static bool zeroOutputDependences(SUnit &ISU, SDep &Dep);
+ unsigned makeBundleAssumptions(SUnit &ISU, SDep &Dep);
+ bool memoryRAWHazard(SUnit &ISU, SDep &Dep, unsigned latency);
+};
+
+/// Note that you have to add:
+///   DAG.addMutation(createARMLatencyMutations(ST, AA));
+/// to ARMPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createARMLatencyMutations(const class ARMSubtarget &, AAResults *AA);
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index b94a5fc1614697..22e7e6893c1a82 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -95,8 +95,12 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
"Cortex-M3 ARM processors", []>;
+def ProcM55 : SubtargetFeature<"m55", "ARMProcFamily", "CortexM55",
+ "Cortex-M55 ARM processors", []>;
def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
"Cortex-M7 ARM processors", []>;
+def ProcM85 : SubtargetFeature<"m85", "ARMProcFamily", "CortexM85",
+ "Cortex-M85 ARM processors", []>;
//===----------------------------------------------------------------------===//
// ARM processors
@@ -384,6 +388,7 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureFixCMSE_CVE_2021_35465]>;
def : ProcessorModel<"cortex-m55", CortexM55Model, [ARMv81mMainline,
+ ProcM55,
FeatureDSP,
FeatureFPARMv8_D16,
FeatureUseMISched,
@@ -394,6 +399,7 @@ def : ProcessorModel<"cortex-m55", CortexM55Model, [ARMv81mMainline,
FeatureFixCMSE_CVE_2021_35465]>;
def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline,
+ ProcM85,
FeatureDSP,
FeatureFPARMv8_D16,
FeaturePACBTI,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 2f7af05a259f8f..611eeac9ef7128 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -291,7 +291,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isSwift() const { return ARMProcFamily == Swift; }
bool isCortexM3() const { return ARMProcFamily == CortexM3; }
+ bool isCortexM55() const { return ARMProcFamily == CortexM55; }
bool isCortexM7() const { return ARMProcFamily == CortexM7; }
+ bool isCortexM85() const { return ARMProcFamily == CortexM85; }
bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
bool isCortexR5() const { return ARMProcFamily == CortexR5; }
bool isKrait() const { return ARMProcFamily == Krait; }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 608e43d5c0bcde..360cd7272ca704 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,6 +11,7 @@
#include "ARMTargetMachine.h"
#include "ARM.h"
+#include "ARMLatencyMutations.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMMacroFusion.h"
#include "ARMSubtarget.h"
@@ -368,6 +369,8 @@ class ARMPassConfig : public TargetPassConfig {
const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
if (ST.hasFusion())
DAG->addMutation(createARMMacroFusionDAGMutation());
+ if (auto Mutation = createARMLatencyMutations(ST, C->AA))
+ DAG->addMutation(std::move(Mutation));
return DAG;
}
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index 3d6af28b437538..a39629bd8aeb02 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -62,6 +62,7 @@ add_llvm_target(ARMCodeGen
MVETailPredication.cpp
MVEVPTBlockPass.cpp
MVETPAndVPTOptimisationsPass.cpp
+ ARMLatencyMutations.cpp
Thumb1FrameLowering.cpp
Thumb1InstrInfo.cpp
ThumbRegisterInfo.cpp