[clang] [llvm] [RISCV] Add processor definition and scheduling model for XiangShan-KunMingHu (PR #90392)

Camel Coder via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 29 07:31:29 PDT 2024


=?utf-8?b?6YOd5bq36L6+?= <hebo at bosc.ac.cn>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/90392 at github.com>


================
@@ -0,0 +1,1489 @@
+//==- RISCVSchedXiangShanKunMingHu.td - XiangShanKunMingHu Scheduling Defs -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// XiangShan is a high-performance open-source RISC-V processor project
+// initiated by the Institute of Computing Technology (ICT), Chinese Academy
+// of Sciences (CAS). KunMingHu is its third-generation microarchitecture,
+// developed jointly by ICT, CAS and the Beijing Institute of Open Source
+// Chip (BOSC), with a focus on achieving higher performance.
+// Source: https://github.com/OpenXiangShan/XiangShan
+// Documentation: https://github.com/OpenXiangShan/XiangShan-doc
+
+//===----------------------------------------------------------------------===//
+// The KunMingHu core implements
+// "RV64IMAFDCV_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh
+// _zksed_zksh_svinval_zicbom_zicboz_zicsr_zifencei".
+// Zvfh/Zvfhmin are not part of that list, so vector floating-point operations
+// only exist for SEW=32 and SEW=64. This class yields SchedSEWSet<...>.val
+// with 8 and 16 dropped when isF is set, and the unmodified set otherwise.
+class NoZvfhSchedSEWSet_rm8and16<string mx, bit isF = 0, bit isWidening = 0> {
+  defvar allSEWs = SchedSEWSet<mx, isF, isWidening>.val;
+  list<int> val = !if(isF, !listremove(allSEWs, [8, 16]), allSEWs);
+}
+
+// First (presumably smallest -- depends on the ordering of SchedSEWSet) SEW
+// left for (mx, isF, isWidening) once NoZvfhSchedSEWSet_rm8and16 has filtered
+// the set; exposed as field `r`.
+// NOTE(review): !head is applied unconditionally, so this presumably must not
+// be instantiated for an mx whose filtered list is empty -- confirm callers.
+class NoZvfhSmallestSEW<string mx, bit isF = 0, bit isWidening = 0> {
+  int r = !head(NoZvfhSchedSEWSet_rm8and16<mx, isF, isWidening>.val);
+}
+
+// Emits ReadAdvance records for the per-LMUL/per-SEW SchedReads derived from
+// `name`, restricted to the SEWs left by NoZvfhSchedSEWSet_rm8and16.
+//   name       - SchedRead name prefix.
+//   val        - value forwarded as the ReadAdvance cycle count.
+//   writes     - list forwarded as the ReadAdvance write list.
+//   MxList     - LMUL names to iterate over.
+//   isF        - forwarded to the SEW-set filter.
+//   isWidening - forwarded to the SEW-set filter.
+// A record is only emitted when the corresponding SchedRead exists.
+multiclass NoZvfh_LMULSEWReadAdvanceImpl<string name, int val, list<SchedWrite> writes = [],
+                                  list<string> MxList, bit isF = 0,
+                                  bit isWidening = 0> {
+  // Aggregate "<name>_WorstCase" read, if it is defined.
+  if !exists<SchedRead>(name # "_WorstCase") then
+    def : ReadAdvance<!cast<SchedRead>(name # "_WorstCase"), val, writes>;
+  // One "<name>_<mx>_E<sew>" read per remaining (LMUL, SEW) pair.
+  foreach mx = MxList in {
+    foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF, isWidening>.val in
+      if !exists<SchedRead>(name # "_" # mx # "_E" # sew) then
+        def : ReadAdvance<!cast<SchedRead>(name # "_" # mx # "_E" # sew), val, writes>;
+  }
+}
+
+// Floating-point, non-widening instantiation of NoZvfh_LMULSEWReadAdvanceImpl:
+// iterates SchedMxListF with isF=1 and isWidening=0.
+multiclass LMULSEWReadAdvanceFnoZvfh<string name, int val, list<SchedWrite> writes = []>
+  : NoZvfh_LMULSEWReadAdvanceImpl<name, val, writes, SchedMxListF, isF=1,
+                           isWidening=0>;
+
+// Floating-point, widening instantiation of NoZvfh_LMULSEWReadAdvanceImpl:
+// iterates SchedMxListFW with isF=1 and isWidening=1.
+multiclass LMULSEWReadAdvanceFWnoZvfh<string name, int val, list<SchedWrite> writes = []>
+    : NoZvfh_LMULSEWReadAdvanceImpl<name, val, writes, SchedMxListFW, isF = 1,
+                             isWidening=1>;
+
+//===----------------------------------------------------------------------===//
+// Complement of NoZvfhSchedSEWSet_rm8and16: for floating-point operations
+// (isF set) this drops SEW=32 and SEW=64 and keeps whatever other widths
+// SchedSEWSet<...>.val contains. Without Zvfh/Zvfhmin only SEW=32 and SEW=64
+// are implemented, so the widths left here are the ones that
+// UnsupportedSchedZvfh marks as Unsupported.
+class NoZvfhSchedSEWSet_rm32and64<string mx, bit isF = 0, bit isWidening = 0> {
+  defvar allSEWs = SchedSEWSet<mx, isF, isWidening>.val;
+  list<int> val = !if(isF, !listremove(allSEWs, [32, 64]), allSEWs);
+}
+
+// Write-Impl
+// Emits a WriteRes with the given `resources` for every existing
+// "<name>_<mx>_E<sew>" SchedWrite, where mx ranges over MxList and sew over
+// NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val.
+multiclass NoZvfhLMULSEWWriteResImpl<string name, list<ProcResourceKind> resources,
+                               list<string> MxList, bit isF = 0,
+                               bit isWidening = 0> {
+  foreach mx = MxList in {
+    foreach sew = NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val in
+      // Skip (mx, sew) combinations for which no SchedWrite is defined.
+      if !exists<SchedWrite>(name # "_" # mx # "_E" # sew) then
+        def : WriteRes<!cast<SchedWrite>(name # "_" # mx # "_E" # sew), resources>;
+  }
+}
+// Read-Impl
+// Emits a ReadAdvance<..., val, writes> for every existing
+// "<name>_<mx>_E<sew>" SchedRead, where mx ranges over MxList and sew over
+// NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val.
+// Unlike NoZvfh_LMULSEWReadAdvanceImpl, no "<name>_WorstCase" record is
+// emitted here.
+multiclass NoZvfhLMULSEWReadAdvanceImpl<string name, int val, list<SchedWrite> writes = [],
+                                  list<string> MxList, bit isF = 0,
+                                  bit isWidening = 0> {
+  foreach mx = MxList in {
+    foreach sew = NoZvfhSchedSEWSet_rm32and64<mx, isF, isWidening>.val in
+      // Skip (mx, sew) combinations for which no SchedRead is defined.
+      if !exists<SchedRead>(name # "_" # mx # "_E" # sew) then
+        def : ReadAdvance<!cast<SchedRead>(name # "_" # mx # "_E" # sew), val, writes>;
+  }
+}
+
+// Write
+// Convenience wrappers around NoZvfhLMULSEWWriteResImpl; they differ only in
+// the LMUL list and the isWidening flag that are passed through.
+// Floating-point, non-widening: SchedMxListF.
+multiclass NoZvfhLMULSEWWriteResF<string name, list<ProcResourceKind> resources>
+    : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListF, isF=1>;
+
+// Floating-point, widening: SchedMxListFW.
+multiclass NoZvfhLMULSEWWriteResFW<string name, list<ProcResourceKind> resources>
+    : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListFW, isF=1, isWidening=1>;
+
+// Floating-point, widening reductions: SchedMxListFWRed.
+multiclass NoZvfhLMULSEWWriteResFWRed<string name, list<ProcResourceKind> resources>
+    : NoZvfhLMULSEWWriteResImpl<name, resources, SchedMxListFWRed, isF=1, isWidening=1>;
+
+// Read
+// Convenience wrappers around NoZvfhLMULSEWReadAdvanceImpl.
+// Floating-point, non-widening: SchedMxListF.
+multiclass NoZvfhLMULSEWReadAdvanceF<string name, int val, list<SchedWrite> writes = []>
+  : NoZvfhLMULSEWReadAdvanceImpl<name, val, writes, SchedMxListF, isF=1>;
+// Floating-point, widening: SchedMxListFW.
+multiclass
+    NoZvfhLMULSEWReadAdvanceFW<string name, int val, list<SchedWrite> writes = []>
+    : NoZvfhLMULSEWReadAdvanceImpl<name, val, writes, SchedMxListFW, isF=1,
+                             isWidening = 1>;
+
+// Defines, with Unsupported = true, the vector floating-point SchedWrite and
+// SchedRead variants whose SEW is left over by NoZvfhSchedSEWSet_rm32and64,
+// i.e. the widths other than 32 and 64. Writes get an empty resource list and
+// reads get a ReadAdvance of 0.
+multiclass UnsupportedSchedZvfh {
+let Unsupported = true in {
+// Write
+// 13. Vector Floating-Point Instructions
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFALUV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFALUF", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWALUV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWALUF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMulV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMulF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFDivV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFDivF", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWMulV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWMulF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMulAddV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMulAddF", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWMulAddV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWMulAddF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFSqrtV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFRecpV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMinMaxV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFMinMaxF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFSgnjV", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFSgnjF", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFCvtIToFV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFWCvtFToFV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFNCvtIToFV", []>;
+defm "" : NoZvfhLMULSEWWriteResFW<"WriteVFNCvtFToFV", []>;
+
+// 14. Vector Reduction Operations
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFRedV_From", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFRedOV_From", []>;
+defm "" : NoZvfhLMULSEWWriteResF<"WriteVFRedMinMaxV_From", []>;
+defm "" : NoZvfhLMULSEWWriteResFWRed<"WriteVFWRedV_From", []>;
+defm "" : NoZvfhLMULSEWWriteResFWRed<"WriteVFWRedOV_From", []>;
+
+// Read
+// 13. Vector Floating-Point Instructions
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : NoZvfhLMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+} // Unsupported
+} // UnsupportedSchedZvfh
+
+//===----------------------------------------------------------------------===//
+
+// Cycle count `c` of a VIALU operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVIALU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 2,
+    !eq(mx, "MF4") : 2,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "M1") : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count `c` of a VIMAC operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVIMAC<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 3,
+    !eq(mx, "MF4") : 3,
+    !eq(mx, "MF2") : 3,
+    !eq(mx, "M1") : 3,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 12,
+    !eq(mx, "M8") : 24
+  );
+}
+
+// Cycle count of a vector integer divide for LMUL `mx` and element width
+// `sew`: `c` = `uop` (selected by LMUL) * `cycles` (selected by SEW; the
+// upper end of the range quoted next to each entry).
+class XSGetCyclesVIDIV<string mx, int sew> {
+  int uop = !cond(
+    !eq(mx, "MF8") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8
+  );
+  int cycles = !cond(
+    !eq(sew, 8) : 6,     // I8: 6
+    !eq(sew, 16) : 7,    // I16: 4-7
+    !eq(sew, 32) : 11,   // I32: 4-11
+    !eq(sew, 64) : 19    // I64: 4-19
+  );
+  int c = !mul(uop, cycles);
+}
+
+// Cycle count `c` of a VIPU operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVIPU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 2,
+    !eq(mx, "MF4") : 2,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "M1") : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count `c` of a VPPU operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVPPU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 2,
+    !eq(mx, "MF4") : 2,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "M1") : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count `c` of a VFALU operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVFALU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 2,
+    !eq(mx, "MF4") : 2,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "M1") : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16
+  );
+}
+
+// Cycle count `c` of a VFMA operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVFMA<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 4,
+    !eq(mx, "MF4") : 4,
+    !eq(mx, "MF2") : 4,
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 8,
+    !eq(mx, "M4") : 16,
+    !eq(mx, "M8") : 32
+  );
+}
+
+// Cycle count of a vector floating-point divide for LMUL `mx` and element
+// width `sew`: `c` = `uop` (selected by LMUL) * `cycles` (selected by SEW).
+// Only SEW=32 and SEW=64 are accepted; anything else trips the assertion.
+class XSGetCyclesVFDIV<string mx, int sew> {
+  assert !or(!eq(sew, 32), !eq(sew, 64)), "Floating-point SEW of KunMingHu can only be 32 or 64.";
+  int uop = !cond(
+    !eq(mx, "MF8") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8
+  );
+  int cycles = !cond(
+    !eq(sew, 32) : 10,   // FP32: 10
+    !eq(sew, 64) : 15    // FP64: 15
+  );
+  int c = !mul(uop, cycles);
+}
+
+// Cycle count `c` of a VFCVT operation for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVFCVT<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 3,
+    !eq(mx, "MF4") : 3,
+    !eq(mx, "MF2") : 3,
+    !eq(mx, "M1") : 3,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 12,
+    !eq(mx, "M8") : 24
+  );
+}
+
+// Cycle count `c` of a vector load (VLDU) for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVLDU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 8,
+    !eq(mx, "MF4") : 8,
+    !eq(mx, "MF2") : 8,
+    !eq(mx, "M1") : 8,
+    !eq(mx, "M2") : 16,
+    !eq(mx, "M4") : 32,
+    !eq(mx, "M8") : 64
+  );
+}
+
+// Cycle count `c` of a vector store (VSTU) for LMUL `mx`.
+// Fractional LMULs cost the same as M1; M2/M4/M8 scale linearly from it.
+class XSGetCyclesVSTU<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 7,
+    !eq(mx, "MF4") : 7,
+    !eq(mx, "MF2") : 7,
+    !eq(mx, "M1") : 7,
+    !eq(mx, "M2") : 14,
+    !eq(mx, "M4") : 28,
+    !eq(mx, "M8") : 56
+  );
+}
+
+// Worst-case predicate for LMUL-only tables: `c` is true exactly when `mx`
+// equals LargestLMUL<MxList>.r.
+class XSIsWorstCaseMX<string mx, list<string> MxList> {
+  bit c = !eq(mx, LargestLMUL<MxList>.r);
+}
+
+// Worst-case predicate for LMUL x SEW tables: `c` is true exactly when `mx`
+// equals LargestLMUL<MxList>.r and `sew` equals NoZvfhSmallestSEW<mx, isF>.r
+// (the first SEW left for that LMUL after the Zvfh-only widths are removed).
+class XSIsWorstCaseMXSEW<string mx, int sew, list<string> MxList,
+                               bit isF = 0> {
+  defvar isLargestLMUL = !eq(mx, LargestLMUL<MxList>.r);
+  defvar isSmallestSEW = !eq(sew, NoZvfhSmallestSEW<mx, isF>.r);
+  bit c = !and(isLargestLMUL, isSmallestSEW);
+}
+
+// ReadAdvance of `cycles` (default 2) for `read`, applied with respect to the
+// scalar integer load and atomic SchedWrites listed below.
+// NOTE(review): judging by the name this models a load-unit-to-consumer
+// bypass; no user of this class is visible in this part of the file.
+class XSLDUtoAnyBypass<SchedRead read, int cycles = 2>
+    : ReadAdvance<read, cycles, [WriteLDB, WriteLDH,
+                                 WriteLDW, WriteLDD,
+                                 WriteAtomicW, WriteAtomicD,
+                                 WriteAtomicLDW, WriteAtomicLDD]>;
+
+//===----------------------------------------------------------------------===//
+
+// Top-level machine model for the XiangShan KunMingHu core. All resources and
+// WriteRes/ReadAdvance records that follow are attached to this model.
+def XiangShanKunMingHuModel : SchedMachineModel {
+  let IssueWidth = 6;   // 6-way decode and dispatch
+  // NOTE(review): presumably the reorder-buffer capacity -- confirm.
+  let MicroOpBufferSize = 256;
+  let LoopMicroOpBufferSize = 48;  // Instruction queue size
+  // Same value as the Latency given to the scalar load writes further down.
+  let LoadLatency = 6;
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+  let PostRAScheduler = 1;
+  // The model is not declared complete.
+  let CompleteModel = 0;
+  let UnsupportedFeatures = [HasStdExtZcmt, HasStdExtZkr];
+}
+
+let SchedModel = XiangShanKunMingHuModel in {
+// Define each kind of processor resource and number available.
+/// Pipelines
+// Execution pipes that share BufferSize = 12. The trailing comment on each
+// pipe lists the functional units reachable through it.
+// NOTE(review): the vector/FP and vector load/store pipes are spelled
+// `XSPip...` while every other pipe is spelled `XSPipe...`; consider unifying
+// the prefix.
+let BufferSize = 12 in {
+  // Integer
+  def XSPipeALU0 : ProcResource<1>; // ALU, MUL, BKU
+  def XSPipeALU1 : ProcResource<1>; // ALU, MUL, BKU
+  def XSPipeALU2 : ProcResource<1>; // ALU
+  def XSPipeALU3 : ProcResource<1>; // ALU
+  def XSPipeBJU0 : ProcResource<1>; // BRU, JMP
+  def XSPipeBJU1 : ProcResource<1>; // BRU, JMP
+  def XSPipeBJU2 : ProcResource<1>; // BRU, JMP, I2F, I2V, VSET, CSR, FENCE
+  def XSPipeDIV  : ProcResource<1>; // DIV
+
+  // Vector and floating-point
+  def XSPipVFEX0 : ProcResource<1>; // VFALU, VFMA, VIALU, VIMAC
+  def XSPipVFEX1 : ProcResource<1>; // VIPU, VPPU, VFCVT, F2V, VSET2
+  def XSPipVFEX2 : ProcResource<1>; // VFALU, VFMA, VIALU
+  def XSPipVFEX3 : ProcResource<1>; // VFDIV, VIDIV
+
+  // Vector load and store
+  def XSPipVLDU  : ProcResource<1>; // VLDU
+  def XSPipVSTU  : ProcResource<1>; // VSTU
+}
+
+// Scalar memory pipes: three load pipes and two store pipes, all with
+// BufferSize = 24.
+let BufferSize = 24 in {
+  // Load and store
+  def XSPipeLDU0 : ProcResource<1>; // LDU
+  def XSPipeLDU1 : ProcResource<1>; // LDU
+  def XSPipeLDU2 : ProcResource<1>; // LDU
+  def XSPipeSTU0 : ProcResource<1>; // STU
+  def XSPipeSTU1 : ProcResource<1>; // STU
+}
+
+// Resource groups, one per functional-unit kind. Several groups name the same
+// pipes, so the corresponding operations compete for them:
+//   - MUL and BKU both use ALU0/ALU1, which are also part of the ALU group.
+//   - BRU and JMP both use BJU0/BJU1/BJU2.
+def XSPipeGroupALU : ProcResGroup<[XSPipeALU0, XSPipeALU1, XSPipeALU2, XSPipeALU3]>;
+def XSPipeGroupMUL : ProcResGroup<[XSPipeALU0, XSPipeALU1]>;
+def XSPipeGroupBKU : ProcResGroup<[XSPipeALU0, XSPipeALU1]>;
+def XSPipeGroupBRU : ProcResGroup<[XSPipeBJU0, XSPipeBJU1, XSPipeBJU2]>;
+def XSPipeGroupJMP : ProcResGroup<[XSPipeBJU0, XSPipeBJU1, XSPipeBJU2]>;
+
+// VIALU, VFALU and VFMA all use VFEX0/VFEX2.
+def XSPipeGroupVIALU : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+def XSPipeGroupVFALU : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+def XSPipeGroupVFMA  : ProcResGroup<[XSPipVFEX0, XSPipVFEX2]>;
+
+// Scalar load and store pipes.
+def XSPipeGroupLDU : ProcResGroup<[XSPipeLDU0, XSPipeLDU1, XSPipeLDU2]>;
+def XSPipeGroupSTU : ProcResGroup<[XSPipeSTU0, XSPipeSTU1]>;
+
+/// Register files
+// One register file over GPR and one over FPR64.
+// NOTE(review): 224 and 192 are presumably the physical integer and
+// floating-point register counts; the remaining arguments follow the
+// RegisterFile class in TargetSchedule.td -- confirm against it.
+def XS_INT_PRF : RegisterFile<224, [GPR], [1], [1], 0, 0>;
+def XS_FP_PRF  : RegisterFile<192, [FPR64], [1], [1], 0, 0>;
+
+//===----------------------------------------------------------------------===//
+
+// Jump
+let Latency = 1 in {
+  def : WriteRes<WriteJmp, [XSPipeGroupBRU]>;
+  def : WriteRes<WriteJal, [XSPipeGroupJMP]>;
+  def : WriteRes<WriteJalr, [XSPipeGroupJMP]>;
+}
+
+// Integer arithmetic and logic
+let Latency = 1 in {
+  def : WriteRes<WriteIALU32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteIALU, [XSPipeGroupALU]>;
+  def : WriteRes<WriteShiftImm32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteShiftImm, [XSPipeGroupALU]>;
+  def : WriteRes<WriteShiftReg32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteShiftReg, [XSPipeGroupALU]>;
+}
+
+// Integer multiplication
+let Latency = 3 in {
+  def : WriteRes<WriteIMul, [XSPipeGroupMUL]>;
+  def : WriteRes<WriteIMul32, [XSPipeGroupMUL]>;
+}
+
+// Integer division
+// Worst case latency is used.
+// The latency of integer division ranges from 4 to 20.
+let Latency = 20, ReleaseAtCycles = [20] in {
+  def : WriteRes<WriteIDiv32, [XSPipeDIV]>;
+  def : WriteRes<WriteIDiv, [XSPipeDIV]>;
+  def : WriteRes<WriteIRem32, [XSPipeDIV]>;
+  def : WriteRes<WriteIRem, [XSPipeDIV]>;
+}
+
+// Memory
+let Latency = 5 in {
+  def : WriteRes<WriteSTB, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteSTH, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteSTW, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteSTD, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteFST32, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteFST64, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteAtomicSTW, [XSPipeGroupSTU]>;
+  def : WriteRes<WriteAtomicSTD, [XSPipeGroupSTU]>;
+}
+let Latency = 6 in {
+  def : WriteRes<WriteLDB, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteLDH, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteLDW, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteLDD, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteFLD32, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteFLD64, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteAtomicW, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteAtomicD, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteAtomicLDW, [XSPipeGroupLDU]>;
+  def : WriteRes<WriteAtomicLDD, [XSPipeGroupLDU]>;
+}
+
+// VFALU
+// Scalar FP add, compare, min/max, classify and sign-injection writes go to
+// the VFALU pipe group with a latency of 2.
+let Latency = 2 in {
+  def : WriteRes<WriteFAdd32, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFAdd64, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFCmp32, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFCmp64, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFMinMax32, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFMinMax64, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFClass32, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFClass64, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFSGNJ32, [XSPipeGroupVFALU]>;
+  def : WriteRes<WriteFSGNJ64, [XSPipeGroupVFALU]>;
+}
+
+// VFMA
+// Scalar FP multiply and fused multiply-add writes go to the VFMA pipe group
+// with a latency of 4.
+let Latency = 4 in {
+  def : WriteRes<WriteFMul32, [XSPipeGroupVFMA]>;
+  def : WriteRes<WriteFMul64, [XSPipeGroupVFMA]>;
+  def : WriteRes<WriteFMA32, [XSPipeGroupVFMA]>;
+  def : WriteRes<WriteFMA64, [XSPipeGroupVFMA]>;
+}
+
+// VFDIV
+let Latency = 10 in {
+  def : WriteRes<WriteFDiv32, [XSPipVFEX3]>;
+  def : WriteRes<WriteFSqrt32, [XSPipVFEX3]>;
+}
+let Latency = 15 in {
+  def : WriteRes<WriteFDiv64, [XSPipVFEX3]>;
+  def : WriteRes<WriteFSqrt64, [XSPipVFEX3]>;
+}
+
+// VFCVT
+let Latency = 3 in {
+  def : WriteRes<WriteFCvtF32ToI32, [XSPipVFEX1]>;
+  def : WriteRes<WriteFCvtF32ToI64, [XSPipVFEX1]>;
+  def : WriteRes<WriteFCvtF64ToI32, [XSPipVFEX1]>;
+  def : WriteRes<WriteFCvtF64ToI64, [XSPipVFEX1]>;
+  def : WriteRes<WriteFCvtF64ToF32, [XSPipVFEX1]>;
+  def : WriteRes<WriteFCvtF32ToF64, [XSPipVFEX1]>;
+  def : WriteRes<WriteFMovF64ToI64, [XSPipVFEX1]>;
+  def : WriteRes<WriteFMovF32ToI32, [XSPipVFEX1]>;
+}
+
+// I2V
+let Latency = 1 in {
+  def : WriteRes<WriteFMovI64ToF64, [XSPipeBJU2]>;
+  def : WriteRes<WriteFMovI32ToF32, [XSPipeBJU2]>;
+}
+
+// I2F
+let Latency = 3 in {
+  def : WriteRes<WriteFCvtI32ToF32, [XSPipeBJU2]>;
+  def : WriteRes<WriteFCvtI64ToF32, [XSPipeBJU2]>;
+  def : WriteRes<WriteFCvtI32ToF64, [XSPipeBJU2]>;
+  def : WriteRes<WriteFCvtI64ToF64, [XSPipeBJU2]>;
+}
+
+/// Zb*
+let Latency = 1 in {
+  // Zba
+  def : WriteRes<WriteSHXADD, [XSPipeGroupALU]>;
+  def : WriteRes<WriteSHXADD32, [XSPipeGroupALU]>;
+  
+  // Zbb
+  def : WriteRes<WriteRotateImm, [XSPipeGroupALU]>;
+  def : WriteRes<WriteRotateImm32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteRotateReg, [XSPipeGroupALU]>;
+  def : WriteRes<WriteRotateReg32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteREV8, [XSPipeGroupALU]>;
+  def : WriteRes<WriteORCB, [XSPipeGroupALU]>;
+  def : WriteRes<WriteIMinMax, [XSPipeGroupALU]>;
+
+  // Zbs
+  def : WriteRes<WriteSingleBit, [XSPipeGroupALU]>;
+  def : WriteRes<WriteSingleBitImm, [XSPipeGroupALU]>;
+  def : WriteRes<WriteBEXT, [XSPipeGroupALU]>;
+  def : WriteRes<WriteBEXTI, [XSPipeGroupALU]>;
+
+  // Zbkb
+  def : WriteRes<WriteBREV8, [XSPipeGroupALU]>;
+  def : WriteRes<WritePACK, [XSPipeGroupALU]>;
+  def : WriteRes<WritePACK32, [XSPipeGroupALU]>;
+  def : WriteRes<WriteZIP, [XSPipeGroupALU]>;
+}
+
+let Latency = 3 in {
+  // Zbb
+  def : WriteRes<WriteCLZ, [XSPipeGroupBKU]>;
+  def : WriteRes<WriteCLZ32, [XSPipeGroupBKU]>;
+  def : WriteRes<WriteCTZ, [XSPipeGroupBKU]>;
+  def : WriteRes<WriteCTZ32, [XSPipeGroupBKU]>;
+  def : WriteRes<WriteCPOP, [XSPipeGroupBKU]>;
+  def : WriteRes<WriteCPOP32, [XSPipeGroupBKU]>;
+
+  // Zbc
+  def : WriteRes<WriteCLMUL, [XSPipeGroupBKU]>;
+
+  // Zbkx
+  def : WriteRes<WriteXPERM, [XSPipeGroupBKU]>;
+}
+
+/// Vector extension
+// 3.6 Vector Byte Length vlenb
+def : WriteRes<WriteRdVLENB, [XSPipeGroupALU]>;
+
+// 6. Configuration-Setting Instructions
+// WriteVSETVLI and WriteVSETIVLI use pipe VFEX1 (whose definition comment
+// lists VSET2); WriteVSETVL uses pipe BJU2 (whose definition comment lists
+// VSET). All three have a latency of 1.
+let Latency = 1 in {
+  def : WriteRes<WriteVSETVLI, [XSPipVFEX1]>;
+  def : WriteRes<WriteVSETIVLI, [XSPipVFEX1]>;
+  def : WriteRes<WriteVSETVL, [XSPipeBJU2]>;
+}
+
+// 7. Vector Loads and Stores
+// VLDU
+// WriteVLDE / WriteVLDM: latency comes from XSGetCyclesVLDU and depends on
+// LMUL only. The largest LMUL in SchedMxList is flagged as the worst case.
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVLDU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVLDE",       [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDM",       [XSPipVLDU], mx, IsWorstCase>;
+  }
+}
+
+// VSTU
+// WriteVSTE / WriteVSTM: same scheme as above, using XSGetCyclesVSTU and the
+// vector store pipe.
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVSTU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVSTE",       [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTM",       [XSPipVSTU], mx, IsWorstCase>;
+  }
+}
+
+// VLDU
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVLDU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVLDS8",      [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS16",     [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS32",     [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS64",     [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX8",     [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX16",    [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX32",    [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX64",    [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX8",     [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX16",    [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX32",    [XSPipVLDU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX64",    [XSPipVLDU], mx, IsWorstCase>;
+  }
+}
+
+// VSTU
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVSTU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVSTS8",      [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS16",     [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS32",     [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS64",     [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX8",     [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX16",    [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX32",    [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX64",    [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX8",     [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX16",    [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX32",    [XSPipVSTU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX64",    [XSPipVSTU], mx, IsWorstCase>;
+  }
+}
+
+// VLDU
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVLDU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVLDFF",      [XSPipVLDU], mx, IsWorstCase>;
+  }
+}
+
+// Segment loads and stores, for every LMUL, field count nf in 2..8 and
+// element width eew in {8, 16, 32, 64}.
+// The latency is taken from XSGetCyclesVLDU / XSGetCyclesVSTU and therefore
+// depends on LMUL only; nf and eew only select the SchedWrite name.
+foreach mx = SchedMxList in {
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar CyclesLoad = XSGetCyclesVLDU<mx>.c;
+      defvar CyclesStore = XSGetCyclesVSTU<mx>.c;
+      defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+      let Latency = CyclesLoad in {
+        // VLDU
+        defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,   [XSPipVLDU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [XSPipVLDU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,  [XSPipVLDU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [XSPipVLDU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [XSPipVLDU], mx, IsWorstCase>;
+      }
+      let Latency = CyclesStore in {
+        // VSTU
+        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,   [XSPipVSTU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,  [XSPipVSTU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [XSPipVSTU], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [XSPipVSTU], mx, IsWorstCase>;
+      }
+    }
+  }
+}
+
+// VLDU
+// WriteVLD1R/2R/4R/8R: the latencies equal the M1/M2/M4/M8 entries of
+// XSGetCyclesVLDU (8, 16, 32, 64).
+let Latency = 8 in
+  def : WriteRes<WriteVLD1R, [XSPipVLDU]>;
+let Latency = 16 in
+  def : WriteRes<WriteVLD2R, [XSPipVLDU]>;
+let Latency = 32 in
+  def : WriteRes<WriteVLD4R, [XSPipVLDU]>;
+let Latency = 64 in
+  def : WriteRes<WriteVLD8R, [XSPipVLDU]>;
+
+// VSTU
+// WriteVST1R/2R/4R/8R: the latencies equal the M1/M2/M4/M8 entries of
+// XSGetCyclesVSTU (7, 14, 28, 56).
+let Latency = 7 in
+  def : WriteRes<WriteVST1R, [XSPipVSTU]>;
+let Latency = 14 in
+  def : WriteRes<WriteVST2R, [XSPipVSTU]>;
+let Latency = 28 in
+  def : WriteRes<WriteVST4R, [XSPipVSTU]>;
+let Latency = 56 in
+  def : WriteRes<WriteVST8R, [XSPipVSTU]>;
+
+// 11. Vector Integer Arithmetic Instructions
+// VIALU
+// Except for division (see XSGetCyclesVIDIV / XSGetCyclesVFDIV), the latency
+// of KunMingHu vector extension instructions is independent of SEW.
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVIALU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIALUV",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVExtV",      [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUV",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftV",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpV",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxV",  [XSPipeGroupVIALU], mx, IsWorstCase>;
+  }
+
+  // Because .vx and .vi need to be converted to .vv before execution, 
+  // an additional cycle is required.
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIALUX",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIALUI",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUX",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUI",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftX",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftI",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpX",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpI",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxX",  [XSPipeGroupVIALU], mx, IsWorstCase>;
+  }
+}
+
+// VIALU
+foreach mx = SchedMxListW in {
+  defvar Cycles = XSGetCyclesVIALU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIWALUV",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftV",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+  }
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIWALUX",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWALUI",    [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftX",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftI",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+   
+  }
+}
+
+// VIMAC
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVIMAC<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIMulV",     [XSPipVFEX0], mx, IsWorstCase>;
+  }
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIMulX",     [XSPipVFEX0], mx, IsWorstCase>;
+  }
+}
+
+// VIDIV
+// Integer divide latency depends on both LMUL and SEW (XSGetCyclesVIDIV), so
+// one record is emitted per (mx, sew) pair. The worst case is the largest
+// LMUL combined with the SEW chosen by XSIsWorstCaseMXSEW.
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar Cycles = XSGetCyclesVIDIV<mx, sew>.c;
+    defvar IsWorstCase = XSIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = Cycles in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV",   [XSPipVFEX3], mx, sew, IsWorstCase>;
+    }
+    // WriteVIDivX gets one extra cycle on top of WriteVIDivV.
+    let Latency = !add(Cycles, 1) in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX",   [XSPipVFEX3], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// VIMAC
+foreach mx = SchedMxListW in {
+  defvar Cycles = XSGetCyclesVIMAC<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIWMulV",    [XSPipVFEX0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddV", [XSPipVFEX0], mx, IsWorstCase>;
+  }
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIWMulX",    [XSPipVFEX0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddX", [XSPipVFEX0], mx, IsWorstCase>;
+  }
+}
+
+// VIMAC
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVIMAC<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIMulAddV",  [XSPipVFEX0], mx, IsWorstCase>;
+  }
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIMulAddX",  [XSPipVFEX0], mx, IsWorstCase>;
+  }
+}
+
+// VIALU
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVIALU<mx>.c;
+  defvar IsWorstCase = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = Cycles in {
+    defm "" : LMULWriteResMX<"WriteVIMergeV",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovV",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUV",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUV",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+  }
+  let Latency = !add(Cycles, 1) in {
+    defm "" : LMULWriteResMX<"WriteVIMergeX",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeI",   [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovX",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovI",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+
+    // 12. Vector Fixed-Point Arithmetic Instructions
+    defm "" : LMULWriteResMX<"WriteVSALUX",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUI",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUX",     [XSPipeGroupVIALU], mx, IsWorstCase>;
+  }
+}
+
+// VIMAC: fixed-point multiply writes (WriteVSMulV/X), issued on XSPipVFEX0.
+// The X form is modeled one cycle slower than the V form.
+foreach mx = SchedMxList in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  defvar BaseLat = XSGetCyclesVIMAC<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVSMulV", [XSPipVFEX0], mx, IsWorst>;
+  let Latency = !add(BaseLat, 1) in
+  defm "" : LMULWriteResMX<"WriteVSMulX", [XSPipVFEX0], mx, IsWorst>;
+}
+
+// VIALU: WriteVSShift{V,X,I} on XSPipeGroupVIALU.
+foreach mx = SchedMxList in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  defvar BaseLat = XSGetCyclesVIALU<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVSShiftV", [XSPipeGroupVIALU], mx, IsWorst>;
+  // X and I forms are modeled with one additional cycle.
+  let Latency = !add(BaseLat, 1) in {
+    foreach w = ["WriteVSShiftX", "WriteVSShiftI"] in
+    defm "" : LMULWriteResMX<w, [XSPipeGroupVIALU], mx, IsWorst>;
+  }
+}
+
+// VIALU: WriteVNClip{V,X,I} on XSPipeGroupVIALU, iterated over SchedMxListW.
+foreach mx = SchedMxListW in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  defvar BaseLat = XSGetCyclesVIALU<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVNClipV", [XSPipeGroupVIALU], mx, IsWorst>;
+  // X and I forms are modeled with one additional cycle.
+  let Latency = !add(BaseLat, 1) in {
+    foreach w = ["WriteVNClipX", "WriteVNClipI"] in
+    defm "" : LMULWriteResMX<w, [XSPipeGroupVIALU], mx, IsWorst>;
+  }
+}
+
+// 13. Vector Floating-Point Instructions
+// VFALU: WriteVFALU{V,F} on XSPipeGroupVFALU, one entry per (LMUL, SEW).
+// The SEW list comes from NoZvfhSchedSEWSet_rm8and16, which drops 8 and 16 for FP.
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [XSPipeGroupVFALU], mx, sew, IsWorst>;
+  }
+}
+
+// VFALU: widening WriteVFWALU{V,F} on XSPipeGroupVFALU, per (LMUL, SEW)
+// over SchedMxListFW with the widening FP SEW set.
+foreach mx = SchedMxListFW in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [XSPipeGroupVFALU], mx, sew, IsWorst>;
+  }
+}
+
+// VFMA: WriteVFMul{V,F} on XSPipeGroupVFMA, one entry per (LMUL, SEW).
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFMA<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+  }
+}
+
+// VFDIV: WriteVFDiv{V,F} on XSPipVFEX3.
+// Unlike the other FP groups here, the cycle lookup takes both LMUL and SEW.
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFDIV<mx, sew>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [XSPipVFEX3], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [XSPipVFEX3], mx, sew, IsWorst>;
+  }
+}
+
+// VFMA: widening WriteVFWMul{V,F} on XSPipeGroupVFMA, per (LMUL, SEW)
+// over SchedMxListFW with the widening FP SEW set.
+foreach mx = SchedMxListFW in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFMA<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+  }
+}
+
+// VFMA: WriteVFMulAdd{V,F} on XSPipeGroupVFMA, one entry per (LMUL, SEW).
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFMA<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+  }
+}
+
+// VFMA: widening WriteVFWMulAdd{V,F} on XSPipeGroupVFMA, per (LMUL, SEW)
+// over SchedMxListFW with the widening FP SEW set.
+foreach mx = SchedMxListFW in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFMA<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+    // The F form is modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [XSPipeGroupVFMA], mx, sew, IsWorst>;
+  }
+}
+
+// VFDIV: WriteVFSqrtV on XSPipVFEX3, with latency taken from the
+// XSGetCyclesVFDIV lookup keyed by both LMUL and SEW.
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFDIV<mx, sew>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [XSPipVFEX3], mx, sew, IsWorst>;
+  }
+}
+
+// VFCVT: WriteVFRecpV on XSPipVFEX1, with latency from the XSGetCyclesVFCVT lookup.
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [XSPipVFEX1], mx, sew, IsWorst>;
+  }
+}
+
+// VFALU: WriteVFMinMax{V,F} and WriteVFSgnj{V,F} on XSPipeGroupVFALU,
+// one entry per (LMUL, SEW).
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+    let Latency = BaseLat in {
+      foreach w = ["WriteVFMinMaxV", "WriteVFSgnjV"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    }
+    // F forms are modeled with one additional cycle.
+    let Latency = !add(BaseLat, 1) in {
+      foreach w = ["WriteVFMinMaxF", "WriteVFSgnjF"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    }
+  }
+}
+
+// VFALU: WriteVFCmp{V,F}, WriteVFClassV, WriteVFMergeV and WriteVFMovV on
+// XSPipeGroupVFALU. These are keyed by LMUL only (no SEW dimension).
+foreach mx = SchedMxList in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+  let Latency = BaseLat in {
+    foreach w = ["WriteVFCmpV", "WriteVFClassV", "WriteVFMergeV", "WriteVFMovV"] in
+    defm "" : LMULWriteResMX<w, [XSPipeGroupVFALU], mx, IsWorst>;
+  }
+  // The F form of compare is modeled with one additional cycle.
+  let Latency = !add(BaseLat, 1) in
+  defm "" : LMULWriteResMX<"WriteVFCmpF", [XSPipeGroupVFALU], mx, IsWorst>;
+}
+
+// VFCVT: WriteVFCvtIToFV on XSPipVFEX1, one entry per (LMUL, SEW).
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [XSPipVFEX1], mx, sew, IsWorst>;
+  }
+}
+
+// VFCVT: WriteVFCvtFToIV on XSPipVFEX1, keyed by LMUL only.
+foreach mx = SchedMxList in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxList>.c;
+  defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [XSPipVFEX1], mx, IsWorst>;
+}
+
+// VFCVT: WriteVFWCvtIToFV on XSPipVFEX1, per (LMUL, SEW).
+// Iterates SchedMxListW with the integer widening SEW set (isF=0).
+foreach mx = SchedMxListW in {
+  foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+    defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [XSPipVFEX1], mx, sew, IsWorst>;
+  }
+}
+
+// VFCVT: WriteVFWCvtFToIV on XSPipVFEX1, keyed by LMUL only over SchedMxListFW.
+foreach mx = SchedMxListFW in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxListFW>.c;
+  defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [XSPipVFEX1], mx, IsWorst>;
+}
+
+// VFCVT: WriteVFWCvtFToFV, WriteVFNCvtIToFV and WriteVFNCvtFToFV on XSPipVFEX1,
+// per (LMUL, SEW) over SchedMxListFW with the widening FP SEW set.
+foreach mx = SchedMxListFW in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+    let Latency = BaseLat in {
+      foreach w = ["WriteVFWCvtFToFV", "WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipVFEX1], mx, sew, IsWorst>;
+    }
+  }
+}
+
+// VFCVT: WriteVFNCvtFToIV on XSPipVFEX1, keyed by LMUL only over SchedMxListW.
+foreach mx = SchedMxListW in {
+  defvar IsWorst = XSIsWorstCaseMX<mx, SchedMxListW>.c;
+  defvar BaseLat = XSGetCyclesVFCVT<mx>.c;
+  let Latency = BaseLat in
+  defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [XSPipVFEX1], mx, IsWorst>;
+}
+
+// 14. Vector Reduction Operations
+// VIPU: WriteVIRedV_From and WriteVIRedMinMaxV_From on XSPipVFEX1,
+// one entry per (LMUL, SEW).
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    defvar BaseLat = XSGetCyclesVIPU<mx>.c;
+    let Latency = BaseLat in {
+      foreach w = ["WriteVIRedV_From", "WriteVIRedMinMaxV_From"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipVFEX1], mx, sew, IsWorst>;
+    }
+  }
+}
+
+// VIPU: WriteVIWRedV_From on XSPipVFEX1, per (LMUL, SEW) over SchedMxListWRed
+// with the integer widening SEW set.
+foreach mx = SchedMxListWRed in {
+  foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+    defvar BaseLat = XSGetCyclesVIPU<mx>.c;
+    let Latency = BaseLat in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [XSPipVFEX1], mx, sew, IsWorst>;
+  }
+}
+
+// VFALU: WriteVFRedV_From, WriteVFRedOV_From and WriteVFRedMinMaxV_From on
+// XSPipeGroupVFALU, one entry per (LMUL, SEW).
+foreach mx = SchedMxListF in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=0>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+    let Latency = BaseLat in {
+      foreach w = ["WriteVFRedV_From", "WriteVFRedOV_From", "WriteVFRedMinMaxV_From"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    }
+  }
+}
+
+// VFALU: WriteVFWRedV_From and WriteVFWRedOV_From on XSPipeGroupVFALU,
+// per (LMUL, SEW) over SchedMxListFWRed with the widening FP SEW set.
+foreach mx = SchedMxListFWRed in {
+  foreach sew = NoZvfhSchedSEWSet_rm8and16<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorst = XSIsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, isF=1>.c;
+    defvar BaseLat = XSGetCyclesVFALU<mx>.c;
+    let Latency = BaseLat in {
+      foreach w = ["WriteVFWRedV_From", "WriteVFWRedOV_From"] in
+      defm "" : LMULSEWWriteResMXSEW<w, [XSPipeGroupVFALU], mx, sew, IsWorst>;
+    }
+  }
+}
+
+// 15. Vector Mask Instructions
+// VIALU
+foreach mx = SchedMxList in {
+  defvar Cycles = XSGetCyclesVIALU<mx>.c;
----------------
camel-cdr wrote:

Are you sure this is correct? Since masks always fit into a single LMUL=1 vector register, you'd expect an LMUL=8 SEW=8 vmand.mm to have the same latency as an LMUL=1 vand.vv. Or does XiangShan use a different internal format for mask registers? See how the SiFiveP600 scheduler sets the latency of all mask instructions to 1.


https://github.com/llvm/llvm-project/pull/90392


More information about the llvm-commits mailing list