[llvm] [AArch64] Add Cortex-A320 scheduling model (PR #144385)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 16 09:22:36 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Ties Stuij (stuij)
<details>
<summary>Changes</summary>
Instead of using the Cortex-A510 scheduling model, Cortex-A320 now uses its own scheduling model, based off of the Cortex-A320 Software Optimization Guide:
https://developer.arm.com/documentation/110285/r0p1
---
Patch is 1.22 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144385.diff
6 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64.td (+1)
- (modified) llvm/lib/Target/AArch64/AArch64Processors.td (+1-1)
- (added) llvm/lib/Target/AArch64/AArch64SchedA320.td (+1415)
- (added) llvm/test/tools/llvm-mca/AArch64/Cortex/A320-basic-instructions.s (+3721)
- (added) llvm/test/tools/llvm-mca/AArch64/Cortex/A320-neon-instructions.s (+3208)
- (added) llvm/test/tools/llvm-mca/AArch64/Cortex/A320-sve-instructions.s (+10258)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index f303819f411d6..eb5a5199b8951 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -103,6 +103,7 @@ def MTEUnsupported : AArch64Unsupported {
let F = [HasPAuth, HasPAuthLR] in
def PAUnsupported : AArch64Unsupported;
+include "AArch64SchedA320.td"
include "AArch64SchedA53.td"
include "AArch64SchedA55.td"
include "AArch64SchedA510.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c7ea6393e2ad3..e1b82953aad80 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1131,7 +1131,7 @@ def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,
[TuneA35]>;
def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
[TuneA35]>;
-def : ProcessorModel<"cortex-a320", CortexA510Model, ProcessorFeatures.A320,
+def : ProcessorModel<"cortex-a320", CortexA320Model, ProcessorFeatures.A320,
[TuneA320]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, ProcessorFeatures.A53,
[TuneA53]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td
new file mode 100644
index 0000000000000..9260f0eac789f
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td
@@ -0,0 +1,1415 @@
+//==- AArch64SchedA320.td - ARM Cortex-A320 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A320 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A320 machine model for scheduling and other instruction cost heuristics.
+def CortexA320Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Cortex-A320 is an in-order processor
+ let IssueWidth = 1; // Cortex-A320 is a single-issue processor
+ let LoadLatency = 5; // NOTE(review): WriteLD/WriteLDIdx/WriteLDHi in this file use Latency = 4 - confirm 5 is intended
+ let PostRAScheduler = 1; // Enable PostRA scheduler pass.
+ let CompleteModel = 0; // Model is not marked complete; it only covers instructions applicable to Cortex-A320.
+
+ let FullInstRWOverlapCheck = 0; // Full InstRW overlap checking is turned off for this model.
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types
+
+let SchedModel = CortexA320Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
+// Cortex-A320 is in-order.
+let BufferSize = 0 in {
+ def CortexA320UnitALU : ProcResource<1>; // Int ALU
+ def CortexA320UnitMAC : ProcResource<1>; // Int MAC, 64-bit wide
+ def CortexA320UnitDiv : ProcResource<1>; // Int Division, not pipelined
+ def CortexA320UnitLdSt : ProcResource<1>; // Load/Store shared pipe
+ def CortexA320UnitB : ProcResource<1>; // Branch
+ def CortexA320UnitPAC : ProcResource<1>; // Pointer Authentication (PAC) pipe
+
+ // The FP DIV/SQRT instructions execute totally differently from the FP ALU
+ // instructions; that's why for now we model them with 2 resources.
+ def CortexA320UnitVALU : ProcResource<1>; // SIMD/FP/SVE ALU
+ def CortexA320UnitVMAC : ProcResource<1>; // SIMD/FP/SVE MAC
+ def CortexA320UnitVMC : ProcResource<1>; // SIMD/FP/SVE multicycle instrs (e.g. Div, SQRT, cryptography)
+}
+
+// These latencies are modeled without taking into account forwarding paths
+// (the software optimisation guide lists latencies taking into account
+// typical forwarding paths).
+def : WriteRes<WriteImm, [CortexA320UnitALU]> { let Latency = 1; } // MOVN, MOVZ
+def : WriteRes<WriteI, [CortexA320UnitALU]> { let Latency = 1; } // ALU
+def : WriteRes<WriteISReg, [CortexA320UnitALU]> { let Latency = 2; } // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [CortexA320UnitALU]> { let Latency = 2; } // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [CortexA320UnitALU]> { let Latency = 2; } // EXTR from a reg pair
+def : WriteRes<WriteIS, [CortexA320UnitALU]> { let Latency = 2; } // Shift/Scale
+
+// MAC
+def : WriteRes<WriteIM32, [CortexA320UnitMAC]> { let Latency = 3; } // 32-bit Multiply
+def : WriteRes<WriteIM64, [CortexA320UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];} // 64-bit Multiply
+
+// Div
+def : WriteRes<WriteID32, [CortexA320UnitDiv]> {
+ let Latency = 12; let ReleaseAtCycles = [12];
+}
+def : WriteRes<WriteID64, [CortexA320UnitDiv]> {
+ let Latency = 20; let ReleaseAtCycles = [20];
+}
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to Cortex-A320
+
+//===----------------------------------------------------------------------===//
+// Write that uses the single resource `res` and has latency `n`.
+class CortexA320Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+ let Latency = n;
+}
+
+// Write on the single resource `res` with latency `n`, ReleaseAtCycles set to
+// [m], and BeginGroup set.
+class CortexA320MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> {
+ let Latency = n;
+ let ReleaseAtCycles = [m];
+ let BeginGroup = 1;
+}
+
+// Same as CortexA320MCWrite (latency `n`, BeginGroup set) but without an
+// explicit ReleaseAtCycles override.
+class CortexA320MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+ let Latency = n;
+ let BeginGroup = 1;
+}
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 2 micro-op types
+def CortexA320Write_11cyc_1VMAC_1VALU : SchedWriteRes<[CortexA320UnitVALU, CortexA320UnitVMAC]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+
+def CortexA320Write_16cyc_1VMAC_1VALU : SchedWriteRes<[CortexA320UnitVALU, CortexA320UnitVMAC]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+}
+
+class CortexA320Write_PAC_B <int lat> : SchedWriteRes<[CortexA320UnitPAC, CortexA320UnitB]> {
+ let Latency = lat;
+ let NumMicroOps = 2;
+}
+
+// Load
+def : WriteRes<WriteLD, [CortexA320UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [CortexA320UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [CortexA320UnitLdSt]> { let Latency = 4; }
+
+def CortexA320WriteVLD1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; }
+def CortexA320WriteVLD1SI : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let SingleIssue = 1; }
+
+def CortexA320WriteVLD2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4;
+ let ReleaseAtCycles = [2]; }
+
+def CortexA320WriteVLD3 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [3]; }
+
+def CortexA320WriteVLD4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6;
+ let ReleaseAtCycles = [4]; }
+
+def CortexA320WriteVLD6 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [3]; }
+
+def CortexA320WriteVLD8 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6;
+ let ReleaseAtCycles = [4]; }
+
+def CortexA320WriteLDP1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteLDP2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteLDP4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+
+// Pre/Post Indexing - Performed as part of address generation
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+let RetireOOO = 1 in {
+def : WriteRes<WriteST, [CortexA320UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [CortexA320UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [CortexA320UnitLdSt]> { let Latency = 1; }
+}
+def : WriteRes<WriteSTX, [CortexA320UnitLdSt]> { let Latency = 3; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [2];}
+def CortexA320WriteVST1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteVST2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [2]; }
+def CortexA320WriteVST3 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [3]; }
+def CortexA320WriteVST4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+ let ReleaseAtCycles = [4]; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [CortexA320UnitB]>;
+def : WriteRes<WriteBrReg, [CortexA320UnitB]>;
+def : WriteRes<WriteSys, [CortexA320UnitB]>;
+def : WriteRes<WriteBarrier, [CortexA320UnitB]>;
+def : WriteRes<WriteHint, [CortexA320UnitB]>;
+
+// FP ALU
+// As the WriteF result is produced in F5 and can mostly be forwarded
+// to the consumer at F1, the effective latency is set to 4.
+def : WriteRes<WriteF, [CortexA320UnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCmp, [CortexA320UnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFCvt, [CortexA320UnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [CortexA320UnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [CortexA320UnitVALU]> { let Latency = 3; }
+
+class CortexA320VSt<int n> : SchedWriteRes<[CortexA320UnitLdSt]> {
+ let RetireOOO = 1;
+ let ReleaseAtCycles = [n];
+}
+
+def CortexA320VSt0 : SchedWriteRes<[CortexA320UnitLdSt]> {
+ let RetireOOO = 1;
+}
+
+def : SchedAlias<WriteVd, CortexA320Write<4, CortexA320UnitVALU>>;
+def : SchedAlias<WriteVq, CortexA320Write<4, CortexA320UnitVALU>>;
+
+// FP ALU specific new schedwrite definitions
+def CortexA320WriteFPALU_F3 : SchedWriteRes<[CortexA320UnitVALU]> { let Latency = 3;}
+def CortexA320WriteFPALU_F4 : SchedWriteRes<[CortexA320UnitVALU]> { let Latency = 4;}
+
+// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
+def : WriteRes<WriteFMul, [CortexA320UnitVMAC]> { let Latency = 4; }
+
+let RetireOOO = 1 in {
+def : WriteRes<WriteFDiv, [CortexA320UnitVMC]> { let Latency = 22;
+ let ReleaseAtCycles = [29]; }
+def CortexA320WriteVMAC : SchedWriteRes<[CortexA320UnitVMAC]> { let Latency = 4; }
+def CortexA320WriteFDivHP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 8;
+ let ReleaseAtCycles = [5]; }
+def CortexA320WriteFDivSP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 13;
+ let ReleaseAtCycles = [10]; }
+def CortexA320WriteFDivDP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 22;
+ let ReleaseAtCycles = [19]; }
+def CortexA320WriteFSqrtHP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 8;
+ let ReleaseAtCycles = [5]; }
+def CortexA320WriteFSqrtSP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 12;
+ let ReleaseAtCycles = [9]; }
+def CortexA320WriteFSqrtDP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 22;
+ let ReleaseAtCycles = [19]; }
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 1>;
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+
+
+// MUL
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 2>;
+
+// Div
+def : ReadAdvance<ReadID, 0>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+def CortexA320WriteISReg : SchedWriteVariant<[
+ SchedVar<RegShiftedPred, [WriteISReg]>,
+ SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[CortexA320WriteISReg], (instregex ".*rs$")>;
+def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
+
+// Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[CortexA320Write<4, CortexA320UnitPAC>], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[CortexA320Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+ BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+ ERETAA, ERETAB)>;
+
+// Load register, with pointer authentication
+def : InstRW<[CortexA320Write<2, CortexA320UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[CortexA320Write<5, CortexA320UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
+//---
+// Miscellaneous
+//---
+def : InstRW<[CortexA320WriteVLD1SI,CortexA320WriteLDP1], (instregex "LDPS?Wi")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP1], (instregex "LDPSi")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP2], (instregex "LDP(X|D)i")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP4], (instregex "LDPQi")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1SI,CortexA320WriteLDP1], (instregex "LDPS?W(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP1], (instregex "LDPS(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP4], (instregex "LDPQ(pre|post)")>;
+def : InstRW<[WriteI], (instrs COPY)>;
+//---
+// Vector Loads - 128-bit per cycle
+//---
+// 1-element structures
+def CortexA320WriteVLD1Latency3: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [1]; }
+def CortexA320WriteVLD1Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD1Latency5: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; }
+def CortexA320WriteVLD1Latency6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [4]; }
+
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency5], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency6], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1i(8|16|32|64)$")>; // single element
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency5], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency6], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1i(8|16|32|64)_POST$")>; // single element
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // replicate
+
+// 2-element structures
+def CortexA320WriteVLD2Latency3: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD2Latency4Release1: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [1]; }
+def CortexA320WriteVLD2Latency4Release2: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD2Latency4Release6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [6]; }
+
+def : InstRW<[CortexA320WriteVLD2Latency4Release1], (instregex "LD2Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency4Release2], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency4Release6], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release1], (instregex "LD2Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release2], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release6], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+// 3-element structures
+def CortexA320WriteVLD3Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [3]; }
+def CortexA320WriteVLD3Latency5Release6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [6]; }
+def CortexA320WriteVLD3Latency5Release7: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [7]; }
+
+def : InstRW<[CortexA320WriteVLD3Latency5Release6], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[CortexA320WriteVLD3Latency5Release7], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD3Latency4], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency5Release6], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency5Release7], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency4], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+// 4-element structures
+def CortexA320WriteVLD4Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [4]; }
+def CortexA320WriteVLD4Latency5Release7: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [7]; }
+def CortexA320WriteVLD4Latency5Release8: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [8]; }
+def CortexA320WriteVLD4Latency6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [7]; }
+
+def : InstRW<[CortexA320WriteVLD4Latency5Release7], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency5Release8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency6], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency4], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; // replicate, all arrangements (8h was missing, 8b duplicated)
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency5Release7], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency5Release8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency6], ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/144385
More information about the llvm-commits
mailing list