[clang] [llvm] [RISCV] Add processor definition and scheduling model for XiangShan-KunMingHu (PR #90392)
Camel Coder via cfe-commits
cfe-commits at lists.llvm.org
Mon Apr 29 07:31:29 PDT 2024
=?utf-8?b?6YOd5bq36L6+?= <hebo at bosc.ac.cn>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/90392 at github.com>
================
@@ -0,0 +1,1489 @@
+//==- RISCVSchedXiangShanKunMingHu.td - XiangShanKunMingHu Scheduling Defs -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// XiangShan is a high-performance open-source RISC-V processor project
+// initiated by the Institute of Computing Technology (ICT), Chinese Academy of
+// Sciences (CAS). The KunMingHu architecture is its third-generation derivative,
+// developed by ICT, CAS and the Beijing Institute of Open Source Chip (BOSC),
+// with a focus on achieving higher performance.
+// Source: https://github.com/OpenXiangShan/XiangShan
+// Documentation: https://github.com/OpenXiangShan/XiangShan-doc
+
+//===----------------------------------------------------------------------===//
+// The KunMingHu core supports "RV64IMAFDCV_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh
+// _zksed_zksh_svinval_zicbom_zicboz_zicsr_zifencei".
+// Zvfh is not part of that set, so floating-point SEW can only be 32 or 64, never 8 or 16.
+// SEW list for (mx, isF, isWidening) taken from SchedSEWSet; when isF is set,
+// the entries 8 and 16 are removed from it.
+class NoZvfhSchedSEWSet_rm8and16<string mx, bit isF = 0, bit isWidening = 0> {
+  defvar AllSEWs = SchedSEWSet<mx, isF, isWidening>.val;
+  list<int> val = !if(isF, !listremove(AllSEWs, [8, 16]), AllSEWs);
+}
+
+// r is the head of the filtered SEW list for (mx, isF, isWidening), i.e. the
+// smallest SEW provided that list is in ascending order.
+class NoZvfhSmallestSEW<string mx, bit isF = 0, bit isWidening = 0> {
+  defvar FilteredSEWs = NoZvfhSchedSEWSet_rm8and16<mx, isF, isWidening>.val;
+  int r = !head(FilteredSEWs);
+}
+
+// Emits ReadAdvance records (advance `val`, restricted to `writes`) for the
+// SchedRead family rooted at `name`:
+//   - name # "_WorstCase", and
+//   - name # "_" # mx # "_E" # sew for every mx in MxList and every sew in
+//     NoZvfhSchedSEWSet_rm8and16<mx, isF, isWidening>.
+// Each record is only emitted when a SchedRead with that name exists.
+multiclass NoZvfh_LMULSEWReadAdvanceImpl<string name, int val, list<SchedWrite> writes = [],
+ list<string> MxList, bit isF = 0,
+ bit isWidening = 0> {
+ if !exists<SchedRead>(name # "_WorstCase") then
+ def : ReadAdvance<!cast<SchedRead>(name # "_WorstCase"), val, writes>;
+ foreach mx = MxList in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF, isWidening>.val in
+ if !exists<SchedRead>(name # "_" # mx # "_E" # sew) then
+ def : ReadAdvance<!cast<SchedRead>(name # "_" # mx # "_E" # sew), val, writes>;
+ }
+}
+
+// Floating-point, non-widening instantiation of NoZvfh_LMULSEWReadAdvanceImpl
+// over SchedMxListF.
+multiclass LMULSEWReadAdvanceFnoZvfh<string name, int val, list<SchedWrite> writes = []>
+ : NoZvfh_LMULSEWReadAdvanceImpl<name, val, writes, SchedMxListF, isF=1,
+ isWidening=0>;
+
+// Floating-point, widening instantiation of NoZvfh_LMULSEWReadAdvanceImpl
+// over SchedMxListFW.
+multiclass LMULSEWReadAdvanceFWnoZvfh<string name, int val, list<SchedWrite> writes = []>
+ : NoZvfh_LMULSEWReadAdvanceImpl<name, val, writes, SchedMxListFW, isF = 1,
+ isWidening=1>;
+
+//===----------------------------------------------------------------------===//
+// If Zvfhmin and Zvfh are not supported, floating-point SEW can only be 32 or 64.
+// This class yields the complement: for isF it removes 32 and 64 from the
+// SchedSEWSet list, leaving the floating-point SEWs that are not usable.
+// The multiclasses below iterate over that remainder.
+class NoZvfhSchedSEWSet_rm32and64<string mx, bit isF = 0, bit isWidening = 0> {
+  defvar AllSEWs = SchedSEWSet<mx, isF, isWidening>.val;
+  list<int> val = !if(isF, !listremove(AllSEWs, [32, 64]), AllSEWs);
+}
+
+// Write-Impl
+// Emits a WriteRes using `resources` for every existing SchedWrite named
+// name # "_" # mx # "_E" # sew, with mx taken from MxList and sew taken from
+// NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.
+multiclass NoZvfhLMULSEWWriteResImpl<string name, list<ProcResourceKind> resources,
+ list<string> MxList, bit isF = 0,
+ bit isWidening = 0> {
+ foreach mx = MxList in {
+ foreach sew = NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val in
+ if !exists<SchedWrite>(name # "_" # mx # "_E" # sew) then
+ def : WriteRes<!cast<SchedWrite>(name # "_" # mx # "_E" # sew), resources>;
+ }
+}
+// Read-Impl
+// Emits a ReadAdvance (advance `val`, restricted to `writes`) for every
+// existing SchedRead named name # "_" # mx # "_E" # sew, with mx taken from
+// MxList and sew taken from NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.
+// Unlike NoZvfh_LMULSEWReadAdvanceImpl, no "_WorstCase" record is emitted here.
+multiclass NoZvfhLMULSEWReadAdvanceImpl<string name, int val, list<SchedWrite> writes = [],
+ list<string> MxList, bit isF = 0,
+ bit isWidening = 0> {
+ foreach mx = MxList in {
+ foreach sew = NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val in
+ if !exists<SchedRead>(name # "_" # mx # "_E" # sew) then
+ def : ReadAdvance<!cast<SchedRead>(name # "_" # mx # "_E" # sew), val, writes>;
+ }
+}
+
+// Write
+// Floating-point wrappers around NoZvfhLMULSEWWriteResImpl. They differ only
+// in the LMUL list and in whether isWidening is set.
+// Non-widening, over SchedMxListF.
+multiclass NoZvfhLMULSEWWriteResF<string name, list<ProcResourceKind> resources>
+ : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListF, isF=1>;
+
+// Widening, over SchedMxListFW.
+multiclass NoZvfhLMULSEWWriteResFW<string name, list<ProcResourceKind> resources>
+ : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListFW, isF=1, isWidening=1>;
+
+// Widening, over SchedMxListFWRed.
+multiclass NoZvfhLMULSEWWriteResFWRed<string name, list<ProcResourceKind> resources>
+ : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListFWRed, isF=1, isWidening=1>;
+
+// Read
+// Floating-point wrappers around NoZvfhLMULSEWReadAdvanceImpl.
+// Non-widening, over SchedMxListF.
+multiclass NoZvfhLMULSEWReadAdvanceF<string name, int val, list<SchedWrite> writes = []>
+ : NoZvfhLMULSEWReadAdvanceImpl<name, val, writes, SchedMxListF, isF=1>;
+// Widening, over SchedMxListFW.
+multiclass
+ NoZvfhLMULSEWReadAdvanceFW<string name, int val, list<SchedWrite> writes = []>
+ : NoZvfhLMULSEWReadAdvanceImpl<name, val, writes, SchedMxListFW, isF=1,
+ isWidening = 1>;
+
+// Marks the floating-point vector SchedWrites/SchedReads for the SEWs selected
+// by NoZvfhSchedSEWSet_rm32and64 as Unsupported: every write gets an empty
+// resource list and every read a ReadAdvance of 0.
+multiclass UnsupportedSchedZvfh {
+let Unsupported = true in {
+  // Write
+  // 13. Vector Floating-Point Instructions (non-widening)
+  foreach WName = ["WriteVFALUV", "WriteVFALUF",
+                   "WriteVFMulV", "WriteVFMulF",
+                   "WriteVFDivV", "WriteVFDivF",
+                   "WriteVFMulAddV", "WriteVFMulAddF",
+                   "WriteVFSqrtV", "WriteVFRecpV",
+                   "WriteVFMinMaxV", "WriteVFMinMaxF",
+                   "WriteVFSgnjV", "WriteVFSgnjF",
+                   "WriteVFCvtIToFV"] in
+    defm "" : NoZvfhLMULSEWWriteResF<WName, []>;
+
+  // 13. Vector Floating-Point Instructions (widening list)
+  foreach WName = ["WriteVFWALUV", "WriteVFWALUF",
+                   "WriteVFWMulV", "WriteVFWMulF",
+                   "WriteVFWMulAddV", "WriteVFWMulAddF",
+                   "WriteVFWCvtFToFV", "WriteVFNCvtIToFV",
+                   "WriteVFNCvtFToFV"] in
+    defm "" : NoZvfhLMULSEWWriteResFW<WName, []>;
+
+  // 14. Vector Reduction Operations
+  foreach WName = ["WriteVFRedV_From", "WriteVFRedOV_From",
+                   "WriteVFRedMinMaxV_From"] in
+    defm "" : NoZvfhLMULSEWWriteResF<WName, []>;
+  foreach WName = ["WriteVFWRedV_From", "WriteVFWRedOV_From"] in
+    defm "" : NoZvfhLMULSEWWriteResFWRed<WName, []>;
+
+  // Read
+  // 13. Vector Floating-Point Instructions (non-widening)
+  foreach RName = ["ReadVFALUV", "ReadVFALUF",
+                   "ReadVFMulV", "ReadVFMulF",
+                   "ReadVFDivV", "ReadVFDivF",
+                   "ReadVFMulAddV", "ReadVFMulAddF",
+                   "ReadVFSqrtV", "ReadVFRecpV",
+                   "ReadVFMinMaxV", "ReadVFMinMaxF",
+                   "ReadVFSgnjV", "ReadVFSgnjF",
+                   "ReadVFCvtIToFV"] in
+    defm "" : NoZvfhLMULSEWReadAdvanceF<RName, 0>;
+
+  // 13. Vector Floating-Point Instructions (widening list)
+  foreach RName = ["ReadVFWALUV", "ReadVFWALUF",
+                   "ReadVFWMulV", "ReadVFWMulF",
+                   "ReadVFWMulAddV", "ReadVFWMulAddF",
+                   "ReadVFWCvtFToFV", "ReadVFNCvtIToFV",
+                   "ReadVFNCvtFToFV"] in
+    defm "" : NoZvfhLMULSEWReadAdvanceFW<RName, 0>;
+} // Unsupported
+} // UnsupportedSchedZvfh
+
+//===----------------------------------------------------------------------===//
+
+// Cycle count of a VIALU operation per LMUL: 2 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVIALU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count of a VIMAC operation per LMUL: 3 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVIMAC<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 3,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 12,
+    !eq(mx, "M8") : 24
+  );
+}
+
+// Cycle count of a VIDIV operation: c = uop * cycles, where uop depends on
+// LMUL and cycles is the upper bound of the per-SEW latency range.
+class XSGetCyclesVIDIV<string mx, int sew> {
+  int uop = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8
+  );
+  int cycles = !cond(
+    !eq(sew, 8)  : 6,  // I8: 6
+    !eq(sew, 16) : 7,  // I16: 4-7
+    !eq(sew, 32) : 11, // I32: 4-11
+    !eq(sew, 64) : 19  // I64: 4-19
+  );
+  int c = !mul(uop, cycles);
+}
+
+// Cycle count of a VIPU operation per LMUL: 2 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVIPU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count of a VPPU operation per LMUL: 2 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVPPU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count of a VFALU operation per LMUL: 2 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVFALU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count of a VFMA operation per LMUL: 4 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVFMA<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 4,
+    !eq(mx, "M2") : 8,
+    !eq(mx, "M4") : 16,
+    !eq(mx, "M8") : 32
+  );
+}
+
+// Cycle count of a VFDIV operation: c = uop * cycles, where uop depends on
+// LMUL and cycles on SEW. Only SEW 32 and 64 are accepted (asserted).
+class XSGetCyclesVFDIV<string mx, int sew> {
+  assert !or(!eq(sew, 32), !eq(sew, 64)), "Floating-point SEW of KunMingHu can only be 32 or 64.";
+  int uop = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8
+  );
+  int cycles = !cond(
+    !eq(sew, 32) : 10, // FP32: 10
+    !eq(sew, 64) : 15  // FP64: 15
+  );
+  int c = !mul(uop, cycles);
+}
+
+// Cycle count of a VFCVT operation per LMUL: 3 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVFCVT<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 3,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 12,
+    !eq(mx, "M8") : 24
+  );
+}
+
+// Cycle count of a vector load (VLDU) per LMUL: 8 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVLDU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 8,
+    !eq(mx, "M2") : 16,
+    !eq(mx, "M4") : 32,
+    !eq(mx, "M8") : 64
+  );
+}
+
+// Cycle count of a vector store (VSTU) per LMUL: 7 for M1 and all fractional
+// LMULs, doubling with each larger LMUL.
+class XSGetCyclesVSTU<string mx> {
+  int c = !cond(
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"), !eq(mx, "MF4"), !eq(mx, "MF8")) : 7,
+    !eq(mx, "M2") : 14,
+    !eq(mx, "M4") : 28,
+    !eq(mx, "M8") : 56
+  );
+}
+
+// c is true exactly when mx equals LargestLMUL<MxList>.r, i.e. mx is the
+// worst-case LMUL of the given list.
+class XSIsWorstCaseMX<string mx, list<string> MxList> {
+  bit c = !eq(mx, LargestLMUL<MxList>.r);
+}
+
+// c is true exactly when mx is the largest LMUL of MxList and sew equals
+// NoZvfhSmallestSEW<mx, isF>.r; that (mx, sew) pair is the worst case.
+class XSIsWorstCaseMXSEW<string mx, int sew, list<string> MxList,
+                         bit isF = 0> {
+  defvar IsLargestLMUL = !eq(mx, LargestLMUL<MxList>.r);
+  defvar IsSmallestSEW = !eq(sew, NoZvfhSmallestSEW<mx, isF>.r);
+  bit c = !and(IsLargestLMUL, IsSmallestSEW);
+}
+
+// ReadAdvance of `cycles` (default 2) on `read`, valid only when the value is
+// produced by one of the scalar integer load or atomic writes listed below.
+class XSLDUtoAnyBypass<SchedRead read, int cycles = 2>
+ : ReadAdvance<read, cycles, [WriteLDB, WriteLDH,
+ WriteLDW, WriteLDD,
+ WriteAtomicW, WriteAtomicD,
+ WriteAtomicLDW, WriteAtomicLDD]>;
+
+//===----------------------------------------------------------------------===//
+
+// Top-level machine model for XiangShan KunMingHu. All resources and
+// WriteRes/ReadAdvance records below are attached to it via
+// `let SchedModel = XiangShanKunMingHuModel`.
+def XiangShanKunMingHuModel : SchedMachineModel {
+ let IssueWidth = 6; // 6-way decode and dispatch
+ let MicroOpBufferSize = 256;
+ let LoopMicroOpBufferSize = 48; // Instruction queue size
+ let LoadLatency = 6;
+ let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+ let PostRAScheduler = 1;
+ // NOTE(review): CompleteModel = 0 presumably tolerates instructions without
+ // scheduling info -- confirm against TargetSchedule.td.
+ let CompleteModel = 0;
+ let UnsupportedFeatures = [HasStdExtZcmt, HasStdExtZkr];
+}
+
+let SchedModel = XiangShanKunMingHuModel in {
+// Define each kind of processor resource and number available.
+/// Pipeline
+// Every resource in this group is a single unit with BufferSize = 12; the
+// trailing comment on each line lists the functional units behind that pipe.
+let BufferSize = 12 in {
+ // Integer
+ def XSPipeALU0 : ProcResource<1>; // ALU, MUL, BKU
+ def XSPipeALU1 : ProcResource<1>; // ALU, MUL, BKU
+ def XSPipeALU2 : ProcResource<1>; // ALU
+ def XSPipeALU3 : ProcResource<1>; // ALU
+ def XSPipeBJU0 : ProcResource<1>; // BRU, JMP
+ def XSPipeBJU1 : ProcResource<1>; // BRU, JMP
+ def XSPipeBJU2 : ProcResource<1>; // BRU, JMP, I2F, I2V, VSET, CSR, FENCE
+ def XSPipeDIV : ProcResource<1>; // DIV
+
+ // Vector and floating-point
+ // Note: these names use the "XSPip" prefix (no 'e'), unlike the pipes above.
+ def XSPipVFEX0 : ProcResource<1>; // VFALU, VFMA, VIALU, VIMAC
+ def XSPipVFEX1 : ProcResource<1>; // VIPU, VPPU, VFCVT, F2V, VSET2
+ def XSPipVFEX2 : ProcResource<1>; // VFALU, VFMA, VIALU
+ def XSPipVFEX3 : ProcResource<1>; // VFDIV, VIDIV
+
+ // Vector load and store
+ def XSPipVLDU : ProcResource<1>; // VLDU
+ def XSPipVSTU : ProcResource<1>; // VSTU
+}
+
+// Scalar load/store pipes: three load units and two store units, each a
+// single-unit resource with BufferSize = 24.
+let BufferSize = 24 in {
+ // Load and store
+ def XSPipeLDU0 : ProcResource<1>; // LDU
+ def XSPipeLDU1 : ProcResource<1>; // LDU
+ def XSPipeLDU2 : ProcResource<1>; // LDU
+ def XSPipeSTU0 : ProcResource<1>; // STU
+ def XSPipeSTU1 : ProcResource<1>; // STU
+}
+
+// Resource groups: each group names the set of pipes that can execute one
+// class of operation. Several groups cover the same pipes (MUL/BKU, BRU/JMP,
+// VIALU/VFALU/VFMA) and differ only by name.
+def XSPipeGroupALU : ProcResGroup<[XSPipeALU0, XSPipeALU1, XSPipeALU2, XSPipeALU3]>;
+def XSPipeGroupMUL : ProcResGroup<[XSPipeALU0, XSPipeALU1]>;
+def XSPipeGroupBKU : ProcResGroup<[XSPipeALU0, XSPipeALU1]>;
+def XSPipeGroupBRU : ProcResGroup<[XSPipeBJU0, XSPipeBJU1, XSPipeBJU2]>;
+def XSPipeGroupJMP : ProcResGroup<[XSPipeBJU0, XSPipeBJU1, XSPipeBJU2]>;
+
+// Vector integer ALU, vector FP ALU and vector FMA all share VFEX0 and VFEX2.
+def XSPipeGroupVIALU : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+def XSPipeGroupVFALU : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+def XSPipeGroupVFMA : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+
+// Scalar load and store pipes.
+def XSPipeGroupLDU : ProcResGroup<[XSPipeLDU0, XSPipeLDU1, XSPipeLDU2]>;
+def XSPipeGroupSTU : ProcResGroup<[XSPipeSTU0, XSPipeSTU1]>;
+
+/// Register
+// Register files for the GPR and FPR64 register classes.
+// NOTE(review): the first argument (224 / 192) is presumably the number of
+// physical registers; the remaining arguments follow RegisterFile's parameter
+// order in TargetSchedule.td -- confirm their meaning there.
+def XS_INT_PRF : RegisterFile<224, [GPR], [1], [1], 0, 0>;
+def XS_FP_PRF : RegisterFile<192, [FPR64], [1], [1], 0, 0>;
+
+//===----------------------------------------------------------------------===//
+
+// Jump: single-cycle; WriteJmp uses the BRU group, jal/jalr the JMP group.
+let Latency = 1 in {
+  def : WriteRes<WriteJmp, [XSPipeGroupBRU]>;
+  foreach JmpWrite = [WriteJal, WriteJalr] in
+    def : WriteRes<JmpWrite, [XSPipeGroupJMP]>;
+}
+
+// Integer arithmetic and logic: single-cycle on any ALU pipe.
+let Latency = 1 in {
+  foreach AluWrite = [WriteIALU32, WriteIALU,
+                      WriteShiftImm32, WriteShiftImm,
+                      WriteShiftReg32, WriteShiftReg] in
+    def : WriteRes<AluWrite, [XSPipeGroupALU]>;
+}
+
+// Integer multiplication: 3 cycles on the MUL group.
+let Latency = 3 in {
+  foreach MulWrite = [WriteIMul, WriteIMul32] in
+    def : WriteRes<MulWrite, [XSPipeGroupMUL]>;
+}
+
+// Integer division
+// The latency of integer division ranges from 4 to 20; the worst case is
+// modelled, and the divider is held for the same 20 cycles.
+let Latency = 20, ReleaseAtCycles = [20] in {
+  foreach DivWrite = [WriteIDiv32, WriteIDiv, WriteIRem32, WriteIRem] in
+    def : WriteRes<DivWrite, [XSPipeDIV]>;
+}
+
+// Memory
+// Stores (integer, floating-point and atomic store writes): 5 cycles on the STU group.
+let Latency = 5 in {
+  foreach StoreWrite = [WriteSTB, WriteSTH, WriteSTW, WriteSTD,
+                        WriteFST32, WriteFST64,
+                        WriteAtomicSTW, WriteAtomicSTD] in
+    def : WriteRes<StoreWrite, [XSPipeGroupSTU]>;
+}
+// Loads (integer, floating-point and atomic writes): 6 cycles on the LDU group.
+let Latency = 6 in {
+  foreach LoadWrite = [WriteLDB, WriteLDH, WriteLDW, WriteLDD,
+                       WriteFLD32, WriteFLD64,
+                       WriteAtomicW, WriteAtomicD,
+                       WriteAtomicLDW, WriteAtomicLDD] in
+    def : WriteRes<LoadWrite, [XSPipeGroupLDU]>;
+}
+
+// Scalar FP add/compare/min-max/classify/sign-inject: 2 cycles on the VFALU group.
+let Latency = 2 in {
+  foreach FAluWrite = [WriteFAdd32, WriteFAdd64,
+                       WriteFCmp32, WriteFCmp64,
+                       WriteFMinMax32, WriteFMinMax64,
+                       WriteFClass32, WriteFClass64,
+                       WriteFSGNJ32, WriteFSGNJ64] in
+    def : WriteRes<FAluWrite, [XSPipeGroupVFALU]>;
+}
+
+// Scalar FP multiply and fused multiply-add: 4 cycles on the VFMA group.
+let Latency = 4 in {
+  foreach FmaWrite = [WriteFMul32, WriteFMul64, WriteFMA32, WriteFMA64] in
+    def : WriteRes<FmaWrite, [XSPipeGroupVFMA]>;
+}
+
+// VFDIV
+// Scalar FP divide and square root on VFEX3: 10 cycles for 32-bit,
+// 15 cycles for 64-bit.
+let Latency = 10 in {
+  foreach DivSqrt32 = [WriteFDiv32, WriteFSqrt32] in
+    def : WriteRes<DivSqrt32, [XSPipVFEX3]>;
+}
+let Latency = 15 in {
+  foreach DivSqrt64 = [WriteFDiv64, WriteFSqrt64] in
+    def : WriteRes<DivSqrt64, [XSPipVFEX3]>;
+}
+
+// VFCVT
+// FP-source conversions and FP-to-integer moves: 3 cycles on VFEX1.
+let Latency = 3 in {
+  foreach CvtWrite = [WriteFCvtF32ToI32, WriteFCvtF32ToI64,
+                      WriteFCvtF64ToI32, WriteFCvtF64ToI64,
+                      WriteFCvtF64ToF32, WriteFCvtF32ToF64,
+                      WriteFMovF64ToI64, WriteFMovF32ToI32] in
+    def : WriteRes<CvtWrite, [XSPipVFEX1]>;
+}
+
+// I2V
+// Integer-to-FP register moves: 1 cycle on BJU2.
+let Latency = 1 in {
+  foreach MovWrite = [WriteFMovI64ToF64, WriteFMovI32ToF32] in
+    def : WriteRes<MovWrite, [XSPipeBJU2]>;
+}
+
+// I2F
+// Integer-to-FP conversions: 3 cycles on BJU2.
+let Latency = 3 in {
+  foreach I2FWrite = [WriteFCvtI32ToF32, WriteFCvtI64ToF32,
+                      WriteFCvtI32ToF64, WriteFCvtI64ToF64] in
+    def : WriteRes<I2FWrite, [XSPipeBJU2]>;
+}
+
+/// Zb*
+// Single-cycle bit-manipulation writes, all on the ALU group.
+let Latency = 1 in {
+  foreach ZbWrite = [
+      // Zba
+      WriteSHXADD, WriteSHXADD32,
+      // Zbb
+      WriteRotateImm, WriteRotateImm32,
+      WriteRotateReg, WriteRotateReg32,
+      WriteREV8, WriteORCB, WriteIMinMax,
+      // Zbs
+      WriteSingleBit, WriteSingleBitImm,
+      WriteBEXT, WriteBEXTI,
+      // Zbkb
+      WriteBREV8, WritePACK, WritePACK32, WriteZIP] in
+    def : WriteRes<ZbWrite, [XSPipeGroupALU]>;
+}
+
+// Three-cycle bit-manipulation writes, all on the BKU group.
+let Latency = 3 in {
+  foreach BkuWrite = [
+      // Zbb
+      WriteCLZ, WriteCLZ32,
+      WriteCTZ, WriteCTZ32,
+      WriteCPOP, WriteCPOP32,
+      // Zbc
+      WriteCLMUL,
+      // Zbkx
+      WriteXPERM] in
+    def : WriteRes<BkuWrite, [XSPipeGroupBKU]>;
+}
+
+/// Vector extension
+// 3.6 Vector Byte Length vlenb
+// Reading vlenb goes to the ALU group. No Latency is set here, so the
+// WriteRes default applies.
+def : WriteRes<WriteRdVLENB, [XSPipeGroupALU]>;
+
+// 6. Configuration-Setting Instructions
+// VSET VSET2
+// The immediate forms (vsetvli/vsetivli) go to VFEX1, vsetvl goes to BJU2;
+// all take one cycle.
+let Latency = 1 in {
+  foreach VSetImmWrite = [WriteVSETVLI, WriteVSETIVLI] in
+    def : WriteRes<VSetImmWrite, [XSPipVFEX1]>;
+  def : WriteRes<WriteVSETVL, [XSPipeBJU2]>;
+}
+
+// 7. Vector Loads and Stores
+// VLDU: WriteVLDE / WriteVLDM, latency taken from XSGetCyclesVLDU per LMUL.
+foreach mx = SchedMxList in {
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = XSGetCyclesVLDU<mx>.c in {
+    foreach Name = ["WriteVLDE", "WriteVLDM"] in
+      defm "" : LMULWriteResMX<Name, [XSPipVLDU], mx, Worst>;
+  }
+}
+
+// VSTU: WriteVSTE / WriteVSTM, latency taken from XSGetCyclesVSTU per LMUL.
+foreach mx = SchedMxList in {
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = XSGetCyclesVSTU<mx>.c in {
+    foreach Name = ["WriteVSTE", "WriteVSTM"] in
+      defm "" : LMULWriteResMX<Name, [XSPipVSTU], mx, Worst>;
+  }
+}
+
+// VLDU: WriteVLDS*, WriteVLDUX* and WriteVLDOX* for EEW 8/16/32/64.
+foreach mx = SchedMxList in {
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = XSGetCyclesVLDU<mx>.c in {
+    foreach Prefix = ["WriteVLDS", "WriteVLDUX", "WriteVLDOX"] in
+      foreach eew = [8, 16, 32, 64] in
+        defm "" : LMULWriteResMX<Prefix # eew, [XSPipVLDU], mx, Worst>;
+  }
+}
+
+// VSTU: WriteVSTS*, WriteVSTUX* and WriteVSTOX* for EEW 8/16/32/64.
+foreach mx = SchedMxList in {
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = XSGetCyclesVSTU<mx>.c in {
+    foreach Prefix = ["WriteVSTS", "WriteVSTUX", "WriteVSTOX"] in
+      foreach eew = [8, 16, 32, 64] in
+        defm "" : LMULWriteResMX<Prefix # eew, [XSPipVSTU], mx, Worst>;
+  }
+}
+
+// VLDU: WriteVLDFF, same per-LMUL latency as the other vector loads.
+foreach mx = SchedMxList in {
+  let Latency = XSGetCyclesVLDU<mx>.c in
+    defm "" : LMULWriteResMX<"WriteVLDFF", [XSPipVLDU], mx,
+                             XSIsWorstCaseMX<mx, SchedMxList>.c>;
+}
+
+// Segment loads and stores for nf = 2..8 and EEW 8/16/32/64. Latency depends
+// only on LMUL (XSGetCyclesVLDU / XSGetCyclesVSTU), not on nf or eew.
+foreach mx = SchedMxList in {
+  defvar LoadLat = XSGetCyclesVLDU<mx>.c;
+  defvar StoreLat = XSGetCyclesVSTU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      // VLDU
+      let Latency = LoadLat in {
+        foreach Prefix = ["WriteVLSEG", "WriteVLSEGFF", "WriteVLSSEG",
+                          "WriteVLUXSEG", "WriteVLOXSEG"] in
+          defm "" : LMULWriteResMX<Prefix # nf # "e" # eew, [XSPipVLDU], mx, Worst>;
+      }
+      // VSTU
+      let Latency = StoreLat in {
+        foreach Prefix = ["WriteVSSEG", "WriteVSSSEG",
+                          "WriteVSUXSEG", "WriteVSOXSEG"] in
+          defm "" : LMULWriteResMX<Prefix # nf # "e" # eew, [XSPipVSTU], mx, Worst>;
+      }
+    }
+  }
+}
+
+// VLDU: WriteVLD{1,2,4,8}R, 8 cycles per register (8, 16, 32, 64).
+foreach nr = [1, 2, 4, 8] in {
+  let Latency = !mul(8, nr) in
+    def : WriteRes<!cast<SchedWrite>("WriteVLD" # nr # "R"), [XSPipVLDU]>;
+}
+
+// VSTU: WriteVST{1,2,4,8}R, 7 cycles per register (7, 14, 28, 56).
+foreach nr = [1, 2, 4, 8] in {
+  let Latency = !mul(7, nr) in
+    def : WriteRes<!cast<SchedWrite>("WriteVST" # nr # "R"), [XSPipVSTU]>;
+}
+
+// 11. Vector Integer Arithmetic Instructions
+// VIALU
+// The latency of KunMingHu vector extension instructions is independent of SEW.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIALU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in {
+    foreach Name = ["WriteVIALUV", "WriteVExtV", "WriteVICALUV",
+                    "WriteVShiftV", "WriteVICmpV", "WriteVIMinMaxV"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+
+  // .vx and .vi forms are converted to .vv before execution, which costs
+  // one additional cycle.
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVIALUX", "WriteVIALUI",
+                    "WriteVICALUX", "WriteVICALUI",
+                    "WriteVShiftX", "WriteVShiftI",
+                    "WriteVICmpX", "WriteVICmpI",
+                    "WriteVIMinMaxX"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+}
+
+// VIALU: widening ALU and narrowing shifts, iterated over SchedMxListW.
+// Scalar/immediate operand forms take one cycle more than the .vv form.
+foreach mx = SchedMxListW in {
+  defvar VVLat = XSGetCyclesVIALU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = VVLat in {
+    foreach Name = ["WriteVIWALUV", "WriteVNShiftV"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVIWALUX", "WriteVIWALUI",
+                    "WriteVNShiftX", "WriteVNShiftI"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+}
+
+// VIMAC: integer multiply on VFEX0; the .vx form takes one extra cycle.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIMAC<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in
+    defm "" : LMULWriteResMX<"WriteVIMulV", [XSPipVFEX0], mx, Worst>;
+  let Latency = !add(VVLat, 1) in
+    defm "" : LMULWriteResMX<"WriteVIMulX", [XSPipVFEX0], mx, Worst>;
+}
+
+// VIDIV: integer divide on VFEX3; latency depends on both LMUL and SEW, and
+// the .vx form takes one extra cycle.
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar VVLat = XSGetCyclesVIDIV<mx, sew>.c;
+    defvar Worst = XSIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = VVLat in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [XSPipVFEX3], mx, sew, Worst>;
+    let Latency = !add(VVLat, 1) in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [XSPipVFEX3], mx, sew, Worst>;
+  }
+}
+
+// VIMAC: widening multiply and multiply-add on VFEX0, over SchedMxListW.
+// The .vx forms take one extra cycle.
+foreach mx = SchedMxListW in {
+  defvar VVLat = XSGetCyclesVIMAC<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = VVLat in {
+    foreach Name = ["WriteVIWMulV", "WriteVIWMulAddV"] in
+      defm "" : LMULWriteResMX<Name, [XSPipVFEX0], mx, Worst>;
+  }
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVIWMulX", "WriteVIWMulAddX"] in
+      defm "" : LMULWriteResMX<Name, [XSPipVFEX0], mx, Worst>;
+  }
+}
+
+// VIMAC: integer multiply-add on VFEX0; the .vx form takes one extra cycle.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIMAC<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in
+    defm "" : LMULWriteResMX<"WriteVIMulAddV", [XSPipVFEX0], mx, Worst>;
+  let Latency = !add(VVLat, 1) in
+    defm "" : LMULWriteResMX<"WriteVIMulAddX", [XSPipVFEX0], mx, Worst>;
+}
+
+// VIALU: merge/move plus the fixed-point saturating and averaging ALU writes.
+// Scalar/immediate operand forms take one extra cycle.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIALU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in {
+    foreach Name = ["WriteVIMergeV", "WriteVIMovV",
+                    "WriteVSALUV", "WriteVAALUV"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVIMergeX", "WriteVIMergeI",
+                    "WriteVIMovX", "WriteVIMovI",
+                    // 12. Vector Fixed-Point Arithmetic Instructions
+                    "WriteVSALUX", "WriteVSALUI",
+                    "WriteVAALUX"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+}
+
+// VIMAC: fixed-point multiply (WriteVSMul*) on VFEX0; the .vx form takes one
+// extra cycle.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIMAC<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in
+    defm "" : LMULWriteResMX<"WriteVSMulV", [XSPipVFEX0], mx, Worst>;
+  let Latency = !add(VVLat, 1) in
+    defm "" : LMULWriteResMX<"WriteVSMulX", [XSPipVFEX0], mx, Worst>;
+}
+
+// VIALU: fixed-point scaling shifts (WriteVSShift*); scalar/immediate operand
+// forms take one extra cycle.
+foreach mx = SchedMxList in {
+  defvar VVLat = XSGetCyclesVIALU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = VVLat in
+    defm "" : LMULWriteResMX<"WriteVSShiftV", [XSPipeGroupVIALU], mx, Worst>;
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVSShiftX", "WriteVSShiftI"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+}
+
+// VIALU: narrowing clips (WriteVNClip*), iterated over SchedMxListW;
+// scalar/immediate operand forms take one extra cycle.
+foreach mx = SchedMxListW in {
+  defvar VVLat = XSGetCyclesVIALU<mx>.c;
+  defvar Worst = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = VVLat in
+    defm "" : LMULWriteResMX<"WriteVNClipV", [XSPipeGroupVIALU], mx, Worst>;
+  let Latency = !add(VVLat, 1) in {
+    foreach Name = ["WriteVNClipX", "WriteVNClipI"] in
+      defm "" : LMULWriteResMX<Name, [XSPipeGroupVIALU], mx, Worst>;
+  }
+}
+
+// 13. Vector Floating-Point Instructions
+// VFALU: latency depends on LMUL only; the .vf form takes one extra cycle.
+foreach mx = SchedMxListF in {
+  defvar VVLat = XSGetCyclesVFALU<mx>.c;
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar Worst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    let Latency = VVLat in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [XSPipeGroupVFALU], mx, sew, Worst>;
+    let Latency = !add(VVLat, 1) in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [XSPipeGroupVFALU], mx, sew, Worst>;
+  }
+}
+
+// VFALU: widening FP ALU over SchedMxListFW; the .vf form takes one extra cycle.
+foreach mx = SchedMxListFW in {
+  defvar VVLat = XSGetCyclesVFALU<mx>.c;
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar Worst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+    let Latency = VVLat in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [XSPipeGroupVFALU], mx, sew, Worst>;
+    let Latency = !add(VVLat, 1) in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [XSPipeGroupVFALU], mx, sew, Worst>;
+  }
+}
+
+// VFMA: WriteVFMul{V,F}.
+// Defined per (mx, sew) over SchedMxListF and the FP SEW set with 8 and 16
+// removed (isWidening=0). Latency is XSGetCyclesVFMA<mx> for the V form and
+// that value plus one for the F form; both are bound to XSPipeGroupVFMA.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFMA<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFDIV: WriteVFDiv{V,F}.
+// Unlike the other FP blocks here, the cycle count is looked up with both mx
+// and sew (XSGetCyclesVFDIV<mx, sew>). The writes are bound to the single
+// resource XSPipVFEX3 rather than a pipe group. The F form adds one cycle to
+// the V-form latency.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFDIV<mx, sew>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [XSPipVFEX3], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [XSPipVFEX3], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFMA: WriteVFWMul{V,F}.
+// Widening counterpart of the WriteVFMul definitions: mx ranges over
+// SchedMxListFW and the SEW set is requested with isWidening=1. Latency is
+// XSGetCyclesVFMA<mx> (V form) or that value plus one (F form), on
+// XSPipeGroupVFMA.
+foreach mx = SchedMxListFW in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVFMA<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFMA: WriteVFMulAdd{V,F}.
+// Uses the same cycle source (XSGetCyclesVFMA<mx>) and resource
+// (XSPipeGroupVFMA) as the WriteVFMul definitions, so this model gives
+// multiply-add the same latency as plain multiply. The F form adds one cycle.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFMA<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFMA: WriteVFWMulAdd{V,F}.
+// Widening counterpart of the WriteVFMulAdd definitions: mx ranges over
+// SchedMxListFW and the SEW set is requested with isWidening=1. Latency is
+// XSGetCyclesVFMA<mx> (V form) or that value plus one (F form), on
+// XSPipeGroupVFMA.
+foreach mx = SchedMxListFW in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVFMA<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [XSPipeGroupVFMA], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFDIV: WriteVFSqrtV.
+// Square root takes its cycle count from the divide lookup
+// (XSGetCyclesVFDIV<mx, sew>) and is bound to XSPipVFEX3, the same resource
+// used for WriteVFDiv{V,F}. Only a V form is defined.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFDIV<mx, sew>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [XSPipVFEX3], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFCVT: WriteVFRecpV.
+// This write uses the conversion cycle lookup (XSGetCyclesVFCVT<mx>) and is
+// bound to XSPipVFEX1.
+// NOTE(review): presumably the reciprocal-estimate instructions execute in
+// the convert unit on this core -- confirm against the XiangShan docs.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFALU: WriteVFMinMax{V,F} and WriteVFSgnj{V,F}.
+// Defined per (mx, sew) over SchedMxListF and the FP SEW set with 8 and 16
+// removed. V forms use XSGetCyclesVFALU<mx>; F forms use that value plus one.
+// All four classes are bound to XSPipeGroupVFALU.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFALU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFALU: WriteVFCmp{V,F}, WriteVFClassV, WriteVFMergeV, WriteVFMovV.
+// These classes are keyed by mx only (LMULWriteResMX, no SEW dimension) and
+// iterate the full SchedMxList rather than SchedMxListF. V forms use
+// XSGetCyclesVFALU<mx>; WriteVFCmpF uses that value plus one. All are bound
+// to XSPipeGroupVFALU.
+foreach mx = SchedMxList in {
+ defvar Cycles = XSGetCyclesVFALU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = Cycles in {
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [XSPipeGroupVFALU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFClassV", [XSPipeGroupVFALU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [XSPipeGroupVFALU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [XSPipeGroupVFALU], mx, IsWorstCase>;
+ }
+ let Latency = !add(Cycles, 1) in {
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [XSPipeGroupVFALU], mx, IsWorstCase>;
+ }
+}
+
+// VFCVT: WriteVFCvtIToFV.
+// Defined per (mx, sew) over SchedMxListF and the FP SEW set with 8 and 16
+// removed. Latency is XSGetCyclesVFCVT<mx>; bound to XSPipVFEX1.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFCVT: WriteVFCvtFToIV.
+// Keyed by mx only and iterated over the full SchedMxList. Latency is
+// XSGetCyclesVFCVT<mx>; bound to XSPipVFEX1.
+foreach mx = SchedMxList in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = Cycles in {
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [XSPipVFEX1], mx, IsWorstCase>;
+ }
+}
+
+// VFCVT: WriteVFWCvtIToFV.
+// Iterates SchedMxListW and the plain SchedSEWSet with isF=0, isWidening=1
+// (not the NoZvfh FP set used by most FP blocks in this file), and calls
+// XSIsWorstCaseMXSEW without isF=1.
+// NOTE(review): presumably the integer SEW set is used because the source
+// operand is an integer -- confirm this is intended.
+// Latency is XSGetCyclesVFCVT<mx>; bound to XSPipVFEX1.
+foreach mx = SchedMxListW in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFCVT: WriteVFWCvtFToIV.
+// Keyed by mx only, iterated over SchedMxListFW. Latency is
+// XSGetCyclesVFCVT<mx>; bound to XSPipVFEX1.
+foreach mx = SchedMxListFW in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxListFW>.c;
+ let Latency = Cycles in {
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [XSPipVFEX1], mx, IsWorstCase>;
+ }
+}
+
+// VFCVT: WriteVFWCvtFToFV, WriteVFNCvtIToFV, WriteVFNCvtFToFV.
+// One widening and two narrowing conversion classes share this loop: mx over
+// SchedMxListFW, sew over the FP SEW set with 8 and 16 removed
+// (isWidening=1). All three get latency XSGetCyclesVFCVT<mx> on XSPipVFEX1.
+foreach mx = SchedMxListFW in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFCVT: WriteVFNCvtFToIV.
+// Keyed by mx only, iterated over SchedMxListW. Latency is
+// XSGetCyclesVFCVT<mx>; bound to XSPipVFEX1.
+foreach mx = SchedMxListW in {
+ defvar Cycles = XSGetCyclesVFCVT<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles in {
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [XSPipVFEX1], mx, IsWorstCase>;
+ }
+}
+
+// 14. Vector Reduction Operations
+// VIPU: WriteVIRedV_From, WriteVIRedMinMaxV_From.
+// Defined per (mx, sew) over SchedMxList and SchedSEWSet<mx>. Latency is
+// XSGetCyclesVIPU<mx>, which takes no sew argument, so within this block the
+// latency varies only with mx. Bound to XSPipVFEX1.
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar Cycles = XSGetCyclesVIPU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VIPU: WriteVIWRedV_From.
+// Iterates SchedMxListWRed with the widening integer SEW set
+// (SchedSEWSet<mx, isF=0, isWidening=1>). Latency is XSGetCyclesVIPU<mx>;
+// bound to XSPipVFEX1.
+foreach mx = SchedMxListWRed in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVIPU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [XSPipVFEX1], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFALU: WriteVFRedV_From, WriteVFRedOV_From, WriteVFRedMinMaxV_From.
+// Defined per (mx, sew) over SchedMxListF and the FP SEW set with 8 and 16
+// removed. All three classes receive the same latency, XSGetCyclesVFALU<mx>,
+// and are bound to XSPipeGroupVFALU.
+// NOTE(review): WriteVFRedOV_From is given the same latency as the other two
+// reduction classes here -- confirm that matches the hardware.
+foreach mx = SchedMxListF in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+ defvar Cycles = XSGetCyclesVFALU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// VFALU: WriteVFWRedV_From, WriteVFWRedOV_From.
+// Iterates SchedMxListFWRed with the FP SEW set (8 and 16 removed) requested
+// with isWidening=1. Both classes get latency XSGetCyclesVFALU<mx> on
+// XSPipeGroupVFALU.
+foreach mx = SchedMxListFWRed in {
+ foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = XSGetCyclesVFALU<mx>.c;
+ defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, isF=1>.c;
+ let Latency = Cycles in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [XSPipeGroupVFALU], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// 15. Vector Mask Instructions
+// VIALU
+foreach mx = SchedMxList in {
+ defvar Cycles = XSGetCyclesVIALU<mx>.c;
----------------
camel-cdr wrote:
Are you sure this is correct? Since masks always fit into a single LMUL=1 vector register, you'd expect that an LMUL=8 SEW=8 vmand.mm would have the same latency as an LMUL=1 vand.vv. Or does XiangShan use a different internal format for mask registers? See how the SiFiveP600 scheduler sets the latency of all mask instructions to 1.
https://github.com/llvm/llvm-project/pull/90392
More information about the cfe-commits
mailing list