[llvm] [AArch64] Add C1-Nano scheduling model (PR #182316)

Fri Mar 6 07:01:56 PST 2026

================
@@ -0,0 +1,1401 @@
+//==- AArch64Sched1Nano.td - ARM C1-Nano Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM C1-Nano processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// C1-Nano machine model for scheduling and other instruction cost heuristics.
+def C1NanoModel : SchedMachineModel {
+  let MicroOpBufferSize = 0;  // The C1-Nano is an in-order processor
+  let IssueWidth = 3;         // Up to three micro-ops may issue per cycle.
+                              // NOTE(review): confirm the width against the
+                              // SWOG; an earlier comment here described the
+                              // core as dual-issue, which disagrees with 3.
+  let LoadLatency = 3;        // Cycles for loads to access the cache.
+                              // 2 is best case, 4 is normal case.
+                              // 3 seems to be a good tradeoff
+  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
+  let CompleteModel = 0;      // The model is not required to describe every
+                              // instruction; only those applicable to
+                              // C1-Nano are covered.
+
+  // FIXME: Remove when all errors have been fixed.
+  // While this is 0, an instruction matched by more than one InstRW in this
+  // model is not diagnosed.
+  let FullInstRWOverlapCheck = 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types
+
+let SchedModel = C1NanoModel in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
+// C1-Nano is in-order.
+// Every resource below has a single unit except C1NanoUnitVMAC, which has two.
+let BufferSize = 0 in {
+  def C1NanoUnitALU0   : ProcResource<1>;    // Int ALU0
+  def C1NanoUnitALU1   : ProcResource<1>;    // Int ALU1
+  def C1NanoUnitMAC    : ProcResource<1>;    // Int MAC, 64-bit wide
+  def C1NanoUnitDiv    : ProcResource<1>;    // Int Division, not pipelined
+  // There are 2 LS pipes, 1 for Load/Store; 1 for Load only
+  def C1NanoUnitLdSt   : ProcResource<1>;    // Load/Store shared pipe
+  def C1NanoUnitLd1    : ProcResource<1>;    // Load pipe
+  def C1NanoUnitB      : ProcResource<1>;    // Branch
+  def C1NanoUnitPAC    : ProcResource<1>;    // Pointer Authentication (PAC) pipe
+
+  // The FP DIV/SQRT instructions execute totally differently from the FP ALU
+  // instructions, which can mostly be dual-issued; that's why for now we model
+  // them with 2 resources.
+  // NOTE(review): "2 resources" presumably means the VALU pipes versus the
+  // separate VMC resource -- confirm.
+  def C1NanoUnitVALU0  : ProcResource<1>;    // SIMD/FP/SVE ALU0
+  def C1NanoUnitVALU1  : ProcResource<1>;    // SIMD/FP/SVE ALU1
+  def C1NanoUnitVMAC   : ProcResource<2>;    // SIMD/FP/SVE MAC (two units)
+  def C1NanoUnitVMC    : ProcResource<1>;    // SIMD/FP/SVE multicycle instrs (e.g. Div, SQRT, cryptography)
+}
+
+// Resource groups. A write that names a group can be served by any one of the
+// member units.
+// Loads: the shared Load/Store pipe or the load-only pipe.
+def C1NanoUnitLd     : ProcResGroup<[C1NanoUnitLdSt, C1NanoUnitLd1]>;
+// SIMD/FP/SVE ALU: either vector ALU pipe.
+def C1NanoUnitVALU   : ProcResGroup<[C1NanoUnitVALU0, C1NanoUnitVALU1]>;
+// Integer ALU: either integer ALU pipe.
+def C1NanoUnitALU    : ProcResGroup<[C1NanoUnitALU0, C1NanoUnitALU1]>;
+// These latencies are modeled without taking into account forwarding paths
+// (the software optimisation guide lists latencies taking into account
+// typical forwarding paths).
+// Integer ALU writes, grouped by result latency. All of them can use either
+// integer ALU pipe.
+let Latency = 1 in {
+def : WriteRes<WriteImm, [C1NanoUnitALU]>;    // MOVN, MOVZ
+def : WriteRes<WriteI, [C1NanoUnitALU]>;      // ALU
+}
+let Latency = 2 in {
+def : WriteRes<WriteISReg, [C1NanoUnitALU]>;  // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [C1NanoUnitALU]>;  // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [C1NanoUnitALU]>;   // EXTR from a reg pair
+def : WriteRes<WriteIS, [C1NanoUnitALU]>;     // Shift/Scale
+}
+
+// MAC
+// The 64-bit multiply holds the MAC pipe for two cycles; the 32-bit multiply
+// uses the default occupancy.
+def : WriteRes<WriteIM32, [C1NanoUnitMAC]> {   // 32-bit Multiply
+  let Latency = 3;
+}
+def : WriteRes<WriteIM64, [C1NanoUnitMAC]> {   // 64-bit Multiply
+  let Latency = 4;
+  let ReleaseAtCycles = [2];
+}
+
+// Div
+// ReleaseAtCycles equals Latency, so the divider stays busy for the whole
+// operation.
+let Latency = 8, ReleaseAtCycles = [8] in
+def : WriteRes<WriteID32, [C1NanoUnitDiv]>;
+let Latency = 16, ReleaseAtCycles = [16] in
+def : WriteRes<WriteID64, [C1NanoUnitDiv]>;
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the C1-Nano
+
+//===----------------------------------------------------------------------===//
+// Plain write: result latency `n` on resource `res`.
+class C1NanoWrite<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let Latency = n;
+}
+
+// Multi-cycle write: result latency `n`, holds `res` for `m` cycles, and
+// starts a new issue group.
+class C1NanoMCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let BeginGroup = 1;
+  let ReleaseAtCycles = [m];
+  let Latency = n;
+}
+
+// As C1NanoMCWrite, but ReleaseAtCycles is left at its default.
+class C1NanoMC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let BeginGroup = 1;
+  let Latency = n;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+// Both writes occupy a VALU pipe and the VMAC resource; they differ only in
+// latency.
+let NumMicroOps = 2 in {
+def C1NanoWrite_10cyc_1VMAC_1VALU
+    : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVMAC]> { let Latency = 10; }
+
+def C1NanoWrite_14cyc_1VMAC_1VALU
+    : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVMAC]> { let Latency = 14; }
+}
+
+// Two micro-op write that uses both the PAC pipe and the Branch pipe, with
+// result latency `lat`.
+class C1NanoWrite_PAC_B <int lat> : SchedWriteRes<[C1NanoUnitPAC, C1NanoUnitB]> {
+  let Latency = lat;
+  let NumMicroOps = 2;
+}
+// Load
+// Scalar loads may use either load-capable pipe.
+let Latency = 2 in {
+def : WriteRes<WriteLD,    [C1NanoUnitLd]>;
+def : WriteRes<WriteLDIdx, [C1NanoUnitLd]>;
+def : WriteRes<WriteLDHi,  [C1NanoUnitLd]>;
+}
+
+// Pre/Post Indexing - Performed as part of address generation
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+// Plain stores use the shared Load/Store pipe and may retire out of order.
+let RetireOOO = 1, Latency = 1 in {
+def : WriteRes<WriteST,    [C1NanoUnitLdSt]>;
+def : WriteRes<WriteSTP,   [C1NanoUnitLdSt]>;
+def : WriteRes<WriteSTIdx, [C1NanoUnitLdSt]>;
+}
+// WriteSTX is deliberately kept outside the RetireOOO group.
+def : WriteRes<WriteSTX, [C1NanoUnitLdSt]> { let Latency = 3; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+// Generic fallback for vector stores: latency 5, holding the shared
+// Load/Store pipe for 2 cycles.
+def : WriteRes<WriteVST, [C1NanoUnitLdSt]> { let Latency = 5;
+                                          let ReleaseAtCycles = [2];}
+
+// WriteAtomic consumes no resources and is marked unsupported in this model.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+// Branches, system instructions, barriers and hints all go to the Branch
+// pipe. No Latency is given, so each uses the WriteRes default.
+def : WriteRes<WriteBr, [C1NanoUnitB]>;
+def : WriteRes<WriteBrReg, [C1NanoUnitB]>;
+def : WriteRes<WriteSys, [C1NanoUnitB]>;
+def : WriteRes<WriteBarrier, [C1NanoUnitB]>;
+def : WriteRes<WriteHint, [C1NanoUnitB]>;
+
+// FP ALU
+//   The WriteF result is produced in F5 and can mostly be forwarded to a
+//   consumer at F1, so the effective latency is set to 4.
+def : WriteRes<WriteF, [C1NanoUnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCmp, [C1NanoUnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFCvt, [C1NanoUnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [C1NanoUnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [C1NanoUnitVALU]> { let Latency = 3; }
+
+// Store write on the shared Load/Store pipe that may retire out of order and
+// holds the pipe for `n` cycles. Latency is left at the SchedWriteRes default.
+class C1NanoVSt<int n> : SchedWriteRes<[C1NanoUnitLdSt]> {
+  let RetireOOO = 1;
+  let ReleaseAtCycles = [n];
+}
+
+// Same as C1NanoVSt, but ReleaseAtCycles is also left at its default.
+def C1NanoVSt0      : SchedWriteRes<[C1NanoUnitLdSt]> {
+  let RetireOOO = 1;
+}
+
+def : SchedAlias<WriteVd, C1NanoWrite<4, C1NanoUnitVALU>>;
+def : SchedAlias<WriteVq, C1NanoWrite<4, C1NanoUnitVALU>>;
+
+// FP ALU specific new schedwrite definitions
+def C1NanoWriteFPALU_F3 : SchedWriteRes<[C1NanoUnitVALU]> { let Latency = 3;}
+def C1NanoWriteFPALU_F4 : SchedWriteRes<[C1NanoUnitVALU]> { let Latency = 4;}
+
+// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
+// The generic FP multiply goes to the VMAC resource.
+def : WriteRes<WriteFMul, [C1NanoUnitVMAC]> { let Latency = 4; }
+
+// Everything in this group may retire out of order. Divide and square root
+// use the multicycle VMC resource and hold it for ReleaseAtCycles cycles.
+let RetireOOO = 1 in {
+// Generic FP divide fallback.
+// NOTE(review): ReleaseAtCycles (29) exceeds Latency (22) here, whereas every
+// per-precision entry below uses Latency - 3 -- confirm against the SWOG.
+def : WriteRes<WriteFDiv, [C1NanoUnitVMC]> { let Latency = 22;
+                                            let ReleaseAtCycles = [29]; }
+// Multiply-accumulate on the VMAC resource.
+def C1NanoWriteVMAC : SchedWriteRes<[C1NanoUnitVMAC]> { let Latency = 4; }
+// FP divide, by precision (HP/SP/DP).
+def C1NanoWriteFDivHP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 8;
+                                                     let ReleaseAtCycles = [5]; }
+def C1NanoWriteFDivSP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 13;
+                                                     let ReleaseAtCycles = [10]; }
+def C1NanoWriteFDivDP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 22;
+                                                     let ReleaseAtCycles = [19]; }
+// FP square root, by precision (HP/SP/DP).
+def C1NanoWriteFSqrtHP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 8;
+                                                      let ReleaseAtCycles = [5]; }
+def C1NanoWriteFSqrtSP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 12;
+                                                      let ReleaseAtCycles = [9]; }
+def C1NanoWriteFSqrtDP : SchedWriteRes<[C1NanoUnitVMC]> { let Latency = 22;
+                                                      let ReleaseAtCycles = [19]; }
+}
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// ReadAdvance<R, N> says operand R is read N cycles after issue, which
+// shortens the effective latency from its producer by N. Only ReadST and
+// ReadIMA are non-zero here.
+def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 1>;
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+
+
+// MUL
+// ReadIMA is read two cycles late.
+// NOTE(review): presumably this is the accumulator operand of a
+// multiply-accumulate -- confirm.
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 2>;
+
+// Div
+def : ReadAdvance<ReadID, 0>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+// Variant write for the "...rs" instructions: use WriteISReg when
+// RegShiftedPred holds, otherwise fall back to WriteI.
+def C1NanoWriteISReg : SchedWriteVariant<[
+       SchedVar<RegShiftedPred, [WriteISReg]>,
+       SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[C1NanoWriteISReg], (instregex ".*rs$")>;
+// RBIT is costed with the Shift/Scale write.
+def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
+
+// Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Compute pointer authentication code, using generic key
+def : InstRW<[C1NanoWrite<5, C1NanoUnitPAC>], (instrs PACGA)>;
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code for instruction address
+// "[^G]" keeps PACGA out of this entry: it has its own InstRW just above, and
+// a bare "^PAC" would match it a second time. That overlap is only tolerated
+// while FullInstRWOverlapCheck is 0.
+def : InstRW<[C1NanoWrite<4, C1NanoUnitPAC>], (instregex "^AUT", "^PAC[^G]")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[C1NanoWrite_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+                                            BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+                                            ERETAA, ERETAB)>;
+
+// Load register, with pointer authentication
+def : InstRW<[C1NanoWrite<2, C1NanoUnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[C1NanoWrite<4, C1NanoUnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
+
+//---
+// Load instructions
+//---
+// Writes used for load pairs. Each LDP entry below lists two writes; the
+// pre/post-indexed forms put WriteAdr in front of them.
+// C1NanoWriteVLD1SI is C1NanoWriteVLD1 with SingleIssue set.
+def C1NanoWriteVLD1 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 3; }
+def C1NanoWriteVLD1SI : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 3; let SingleIssue = 1; }
+// C1NanoWriteLDP1/2/4 currently have identical parameters.
+def C1NanoWriteLDP1 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; }
+def C1NanoWriteLDP2 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; }
+def C1NanoWriteLDP4 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; }
+
+// Signed-offset forms.
+def : InstRW<[C1NanoWriteVLD1SI, C1NanoWriteLDP1], (instregex "LDPS?Wi")>;
+def : InstRW<[C1NanoWriteVLD1, C1NanoWriteLDP1], (instregex "LDPSi")>;
+def : InstRW<[C1NanoWriteVLD1, C1NanoWriteLDP2], (instregex "LDP(X|D)i")>;
+def : InstRW<[C1NanoWriteVLD1, C1NanoWriteLDP4], (instregex "LDPQi")>;
+// Pre/post-indexed forms.
+def : InstRW<[WriteAdr, C1NanoWriteVLD1SI, C1NanoWriteLDP1], (instregex "LDPS?W(pre|post)")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1, C1NanoWriteLDP1], (instregex "LDPS(pre|post)")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1, C1NanoWriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1, C1NanoWriteLDP4], (instregex "LDPQ(pre|post)")>;
+// COPY is costed as a plain integer ALU operation.
+def : InstRW<[WriteI], (instrs COPY)>;
+//---
+// Vector Loads - 128-bit per cycle
+//---
+//   1-element structures
+def C1NanoWriteVLD1Latency3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [1]; }
+def C1NanoWriteVLD1Latency4: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def C1NanoWriteVLD1Latency5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; }
+def C1NanoWriteVLD1Latency6: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [5]; }
+
+def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD1Latency4], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD1Latency4], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1i(8|16|32|64)$")>;                // single element
+def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
+
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency4], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency4], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1i(8|16|32|64)_POST$")>;                // single element
+def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // replicate
+
+//    2-element structures
+def C1NanoWriteVLD2Latency3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [2]; }
+def C1NanoWriteVLD2Latency4Release1: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [1]; }
+def C1NanoWriteVLD2Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def C1NanoWriteVLD2Latency4Release4: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [4]; }
+
+def : InstRW<[C1NanoWriteVLD2Latency4Release1], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD2Latency4Release4], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVLD2Latency4Release1], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD2Latency4Release4], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+//    3-element structures
+def C1NanoWriteVLD3Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def C1NanoWriteVLD3Latency5Release3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; }
+def C1NanoWriteVLD3Latency5Release5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [5]; }
+
+def : InstRW<[C1NanoWriteVLD3Latency5Release3], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD3Latency5Release5], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVLD3Latency4Release2], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency5Release3], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency5Release5], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency4Release2], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+//    4-element structures
+def C1NanoWriteVLD4Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def C1NanoWriteVLD4Latency5Release3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; }
+def C1NanoWriteVLD4Latency6Release5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [5]; }
+
+def : InstRW<[C1NanoWriteVLD4Latency5Release3], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[C1NanoWriteVLD4Latency6Release5], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVLD4Latency4Release2], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVLD4Latency5Release3], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD4Latency6Release5], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVLD4Latency4Release2], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+// 1 Element structures
+def C1NanoWriteVST1 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; }
+def C1NanoWriteVST2 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [2]; }
+def C1NanoWriteVST3 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [3]; }
+def C1NanoWriteVST4 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [4]; }
+
+def : InstRW<[C1NanoWriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[C1NanoWriteVST2], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[C1NanoWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[C1NanoWriteVST2], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[C1NanoWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[C1NanoWriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// 2 Element structures
+def : InstRW<[C1NanoWriteVST2], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[C1NanoWriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// 3 Element structures
+def : InstRW<[C1NanoWriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+// Post-indexed forms: same store write, preceded by WriteAdr. The arrangement
+// list is kept identical to the non-post-indexed entry above.
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// 4 Element structures
+def : InstRW<[C1NanoWriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[C1NanoWriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, C1NanoWriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, C1NanoWriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//---
+// Floating Point Conversions, MAC, DIV, SQRT
+//---
+def : InstRW<[C1NanoWriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^XTN")>;
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
+
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
+def : InstRW<[C1NanoWriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
+
+def : InstRW<[C1NanoWriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[C1NanoWriteVMAC], (instregex "^FML(A|S)v.*")>;
+def : InstRW<[C1NanoWriteFDivHP], (instrs FDIVHrr)>;
+def : InstRW<[C1NanoWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[C1NanoWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[C1NanoWriteFDivHP], (instregex "^FDIVv.*16$")>;
+def : InstRW<[C1NanoWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[C1NanoWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[C1NanoWriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
+def : InstRW<[C1NanoWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[C1NanoWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+def : InstRW<[C1NanoWriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>;
+
+// 4.15. Advanced SIMD integer instructions
+// ASIMD absolute diff
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
+// ASIMD absolute diff accum
+def : InstRW<[C1NanoWrite<6, C1NanoUnitVALU>], (instregex "[SU]ABAL?v")>;
+// ASIMD absolute diff long
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDLv")>;
+// ASIMD arith #1
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(ADD|SUB|NEG)v",
+  "[SU]R?HADDv", "[SU]HSUBv")>;
+// ASIMD arith #2
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
+  "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
+  "ADDPv(2i32|4i16|8i8)$")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>;
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
+  "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
+  "ADDPv(16i8|2i64|4i32|8i16)$")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>;
+// ASIMD arith #3
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex  "SADDLv", "UADDLv", "SADDWv",
+  "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex  "ADDHNv", "SUBHNv")>;
+// ASIMD arith #5
+def : InstRW<[C1NanoWrite<8, C1NanoUnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
+// ASIMD arith, reduce
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex  "ADDVv")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex  "SADDLVv", "UADDLVv")>;
+// ASIMD compare #1
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
+// ASIMD compare #2
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
+// ASIMD logical #1
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
+  "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
+  "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
+// ASIMD max/min, basic
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>;
+// ASIMD max/min, reduce
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
+// ASIMD multiply, by element
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
+  "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs PMULv8i8)>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs PMULv16i8)>;
+// ASIMD multiply accumulate
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply accumulate half
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
+// ASIMD multiply accumulate long
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]ML[AS]Lv")>;
+// ASIMD multiply accumulate long #2
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "SQDML[AS]L[iv]")>;
+// ASIMD dot product
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]DOTv8i8")>;
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]DOTv16i8")>;
+// ASIMD dot product, by scalar
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]DOTlanev")>;
+// ASIMD multiply long
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
+// ASIMD pairwise add and accumulate
+def : InstRW<[C1NanoMCWrite<7, 2, C1NanoUnitVALU>], (instregex "[SU]ADALPv")>;
----------------
walkerkd wrote:

Up until now I have been checking the model implementation against what is in the SWOG to find issues.

However, I have just tried checking what is in the llvm-mca tests (which are autogenerated from information in the model) against what is in the SWOG. This has found some more discrepancies in the timings, so I am working on fixing those.

https://github.com/llvm/llvm-project/pull/182316