[clang] [llvm] [RISCV] Add vector and vector crypto to SiFiveP400 scheduler model (PR #102155)

Tue Aug 6 07:40:32 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-risc-v

Author: Michael Maitland (michaelmaitland)

<details>
<summary>Changes</summary>

    The SiFiveP400 scheduler model did not support vector or vector crypto. With
    the addition of the sifive-p470 processor, this model needs to support these
    extensions.

    Processors that use this model but do not implement vector or vector
    crypto will never produce these instructions, so this change has no
    impact on them.

    This change is stacked on #102022

    Co-authored-by: Min Hsu <min.hsu@sifive.com>

---

Patch is 1.04 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102155.diff


19 Files Affected:

- (modified) clang/test/Driver/riscv-cpus.c (+48) 
- (modified) clang/test/Misc/target-invalid-cpu-note.c (+3-2) 
- (modified) llvm/docs/ReleaseNotes.rst (+1) 
- (modified) llvm/lib/Target/RISCV/RISCVProcessors.td (+47-5) 
- (modified) llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td (+867-2) 
- (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/load.s (+13-8) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vislide-vx.s (+108) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vlseg-vsseg.s (+9827) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vmv.s (+895) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vreduce.s (+438) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vrgather.s (+86) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vshift-vmul.s (+132) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvbb.s (+460) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvbc.s (+112) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvkg.s (+127) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvkned.s (+203) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvknhb.s (+152) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvksed.s (+113) 
- (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP400/zvksh.s (+98) 


``````````diff

diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index 7a885cde76d6a..750fb637edeb1 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -304,6 +304,54 @@
 // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbs"
 // MCPU-SIFIVE-P450-SAME: "-target-abi" "lp64d"
 
+// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p470 | FileCheck -check-prefix=MCPU-SIFIVE-P470 %s
+// MCPU-SIFIVE-P470: "-target-cpu" "sifive-p470"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+m"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+a"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+f"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+d"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+c"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+v"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zic64b"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicbom"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicbop"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicboz"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+ziccamoa"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+ziccif"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicclsm"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+ziccrse"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicsr"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zifencei"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zihintntl"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zihintpause"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zihpm"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zmmul"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+za64rs"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zfhmin"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zba"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zbb"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zbs"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvbb"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvbc"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve32f"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve32x"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve64d"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve64f"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve64x"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkg"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkn"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvknc"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkned"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkng"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvknhb"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvks"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvksc"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvksed"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvksg"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvksh"
+// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkt"
+// MCPU-SIFIVE-P470-SAME: "-target-abi" "lp64d"
+
 // RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p670 | FileCheck -check-prefix=MCPU-SIFIVE-P670 %s
 // MCPU-SIFIVE-P670: "-target-cpu" "sifive-p670"
 // MCPU-SIFIVE-P670-SAME: "-target-feature" "+m"
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index b87bced18cb2b..249bea2311549 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -85,7 +85,7 @@
 
 // RUN: not %clang_cc1 -triple riscv64 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV64
 // RISCV64: error: unknown target CPU 'not-a-cpu'
-// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, spacemit-x60, syntacore-scr3-rv64, syntacore-scr4-rv64, veyron-v1, xiangshan-nanhu{{$}}
+// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p470, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, spacemit-x60, syntacore-scr3-rv64, syntacore-scr4-rv64, veyron-v1, xiangshan-nanhu{{$}}
 
 // RUN: not %clang_cc1 -triple riscv32 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV32
 // TUNE-RISCV32: error: unknown target CPU 'not-a-cpu'
@@ -93,4 +93,5 @@
 
 // RUN: not %clang_cc1 -triple riscv64 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV64
 // TUNE-RISCV64: error: unknown target CPU 'not-a-cpu'
-// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, spacemit-x60, syntacore-scr3-rv64, syntacore-scr4-rv64, veyron-v1, xiangshan-nanhu, generic, rocket, sifive-7-series{{$}}
+// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p470, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, spacemit-x60, syntacore-scr3-rv64, syntacore-scr4-rv64, veyron-v1, xiangshan-nanhu, generic, rocket, sifive-7-series{{$}}
+
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index a95cb53694e2b..1ed860de6b9dc 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -107,6 +107,7 @@ Changes to the RISC-V Backend
   the required alignment space with a sequence of `0x0` bytes (the requested
   fill value) rather than NOPs.
 * Added Syntacore SCR4 CPUs: ``-mcpu=syntacore-scr4-rv32/64``
+* ``-mcpu=sifive-p470`` was added.
 
 Changes to the WebAssembly Backend
 ----------------------------------
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 83d27b35cf0da..a118e1fe5e502 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -239,6 +239,12 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
                                        FeatureStdExtZbb],
                                       SiFiveX280TuneFeatures>;
 
+defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
+                                 TuneConditionalCompressedMoveFusion,
+                                 TuneLUIADDIFusion,
+                                 TuneAUIPCADDIFusion,
+                                 FeaturePostRAScheduler];
+
 def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                       [Feature64Bit,
                                        FeatureStdExtI,
@@ -266,11 +272,47 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                        FeatureStdExtZfhmin,
                                        FeatureUnalignedScalarMem,
                                        FeatureUnalignedVectorMem],
-                                      [TuneNoDefaultUnroll,
-                                       TuneConditionalCompressedMoveFusion,
-                                       TuneLUIADDIFusion,
-                                       TuneAUIPCADDIFusion,
-                                       FeaturePostRAScheduler]>;
+                                      SiFiveP400TuneFeatures>;
+
+def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
+                                      [Feature64Bit,
+                                       FeatureStdExtI,
+                                       FeatureStdExtZifencei,
+                                       FeatureStdExtM,
+                                       FeatureStdExtA,
+                                       FeatureStdExtF,
+                                       FeatureStdExtD,
+                                       FeatureStdExtC,
+                                       FeatureVendorXSiFivecdiscarddlone,
+                                       FeatureVendorXSiFivecflushdlone,
+                                       FeatureStdExtZa64rs,
+                                       FeatureStdExtZic64b,
+                                       FeatureStdExtZicbop,
+                                       FeatureStdExtZicbom,
+                                       FeatureStdExtZicboz,
+                                       FeatureStdExtZiccamoa,
+                                       FeatureStdExtZiccif,
+                                       FeatureStdExtZicclsm,
+                                       FeatureStdExtZiccrse,
+                                       FeatureStdExtZihintntl,
+                                       FeatureStdExtZihintpause,
+                                       FeatureStdExtZihpm,
+                                       FeatureStdExtZba,
+                                       FeatureStdExtZbb,
+                                       FeatureStdExtZbs,
+                                       FeatureStdExtZfhmin,
+                                       FeatureStdExtV,
+                                       FeatureStdExtZvl128b,
+                                       FeatureStdExtZvbb,
+                                       FeatureStdExtZvknc,
+                                       FeatureStdExtZvkng,
+                                       FeatureStdExtZvksc,
+                                       FeatureStdExtZvksg,
+                                       FeatureUnalignedScalarMem,
+                                       FeatureUnalignedVectorMem],
+                                      !listconcat(SiFiveP400TuneFeatures,
+                                                  [TuneNoSinkSplatOperands])>;
+
 
 def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                       [Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index 9fa455be38525..6926184e92399 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -8,6 +8,115 @@
 
 //===----------------------------------------------------------------------===//
 
+/// c is true if mx has the worst case behavior compared to LMULs in MxList.
+/// On the SiFiveP400, the worst case LMUL is the Largest LMUL
+/// and the worst case sew is the smallest SEW for that LMUL.
+class SiFiveP400IsWorstCaseMX<string mx, list<string> MxList> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  bit c = !eq(mx, LLMUL);
+}
+
+class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  int SSEW = SmallestSEW<mx, isF>.r;
+  bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// 1 Micro-Op per cycle.
+class SiFiveP400GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+}
+
+// Latency for segmented loads and stores are calculated as vl * nf.
+class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
+  defvar VLEN = 128;
+  defvar VLUpperBound = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+  );
+  int c = !mul(VLUpperBound, nf);
+}
+
+// Both variants of floating point vector reductions are based on numbers collected
+// from llvm-exegesis.
+class VFReduceBaseCycles<int sew> {
+  // The latency for simple unordered VFReduce is `C + 6 * log2(LMUL)`,
+  // and `C * LMUL` for ordered VFReduce. This helper class provides the `C`.
+  int val = !cond(!eq(sew, 16): 16,
+                  !eq(sew, 32): 10,
+                  !eq(sew, 64): 6);
+}
+
+class AdvancedVFReduceCycles<int sew, string mx> {
+  // SEW = 64 has lower latencies and RThroughputs than other SEWs.
+  int latency = !cond(!eq(mx, "M1"): !if(!eq(sew, 64), 4, 6),
+                      !eq(mx, "M2"): !if(!eq(sew, 64), 6, 8),
+                      !eq(mx, "M4"): !if(!eq(sew, 64), 8, 10),
+                      !eq(mx, "M8"): !if(!eq(sew, 64), 11, 13),
+                      true: !if(!eq(sew, 64), 4, 6));
+  int rthroughput = !cond(!eq(mx, "M1"): !if(!eq(sew, 64), 2, 3),
+                          !eq(mx, "M2"): !if(!eq(sew, 64), 3, 4),
+                          !eq(mx, "M4"): !if(!eq(sew, 64), 5, 6),
+                          !eq(mx, "M8"): !if(!eq(sew, 64), 10, 12),
+                          true: !if(!eq(sew, 64), 2, 3));
+}
+
+// Both variants of integer vector reductions are based on numbers collected
+// from llvm-exegesis.
+// TODO: Fractional LMUL's latency and rthroughput.
+class SimpleVIReduceCycles<string mx> {
+  defvar LMul = SiFiveP400GetLMulCycles<mx>.c;
+  int latency = !mul(LMul, 2);
+  int rthroughput = !cond(
+                      !eq(mx, "M1"): 1,
+                      !eq(mx, "M2"): 2,
+                      !eq(mx, "M4"): 4,
+                      !eq(mx, "M8"): 9,
+                      true: 1);
+}
+
+class AdvancedVIReduceCycles<int sew, string mx> {
+  // `C - 2 * log2(SEW)`, where `C` = 16.1, 18.1, 20.1, and 23.8 for
+  // M1/2/4/8, respectively.
+  int latency = !cond(!eq(mx, "M1"): !sub(16, !mul(2, !logtwo(sew))),
+                      !eq(mx, "M2"): !sub(18, !mul(2, !logtwo(sew))),
+                      !eq(mx, "M4"): !sub(20, !mul(2, !logtwo(sew))),
+                      !eq(mx, "M8"): !sub(23, !mul(2, !logtwo(sew))),
+                      true: 4);
+  int rthroughput = !cond(
+                      // `8.3 - 1.02 * log2(SEW)`
+                      !eq(mx, "M1"): !sub(8, !logtwo(sew)),
+                      // `10.0 - 1.16 * log2(SEW)`. Note that `9 - log2(SEW)`
+                      // is closer to the floor value of the original formula.
+                      !eq(mx, "M2"): !sub(9, !logtwo(sew)),
+                      // `14.2 - 1.53 * log2(SEW)`
+                      !eq(mx, "M4"): !div(!sub(1420, !mul(153, !logtwo(sew))), 100),
+                      // `24.1 - 2.3 * log2(SEW)`
+                      !eq(mx, "M8"): !div(!sub(241, !mul(23, !logtwo(sew))), 10),
+                      true: 1);
+}
+
+class SiFiveP400VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2)
+  int c = !cond(!eq(mx, "M2") : 1,
+                !eq(mx, "M4") : 2,
+                !eq(mx, "M8") : 4,
+                true : 1);
+}
+
 def SiFiveP400Model : SchedMachineModel {
   let IssueWidth = 3;         // 3 micro-ops are dispatched per cycle.
   let MicroOpBufferSize = 56; // Max micro-ops that can be buffered.
@@ -45,6 +154,13 @@ defvar SiFiveP400FloatArith  = SiFiveP400FEXQ0;
 defvar SiFiveP400F2I      = SiFiveP400FEXQ0;
 def SiFiveP400FloatDiv    : ProcResource<1>;
 
+// Vector pipeline
+def SiFiveP400VEXQ0        : ProcResource<1>;
+def SiFiveP400VLD          : ProcResource<1>;
+def SiFiveP400VST          : ProcResource<1>;
+def SiFiveP400VDiv         : ProcResource<1>;
+def SiFiveP400VFloatDiv    : ProcResource<1>;
+
 let Latency = 1 in {
 // Integer arithmetic and logic
 def : WriteRes<WriteIALU, [SiFiveP400IntArith]>;
@@ -246,9 +362,549 @@ def : WriteRes<WriteFMovI64ToF64, [SiFiveP400I2F]>;
 def : WriteRes<WriteFMovF64ToI64, [SiFiveP400F2I]>;
 }
 
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [SiFiveP400SYS]>;
+def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
+def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
+
+// 7. Vector Loads and Stores
+// FIXME: This unit is still being improved, currently
+// it is based on stage numbers. Estimates are optimistic,
+// latency may be longer.
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDE",    [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDM",    [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDFF",   [SiFiveP400VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDS8",   [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS16",  [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS32",  [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS64",  [SiFiveP400VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDUX8",  [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX8",  [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTE",    [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTM",    [SiFiveP400VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTS8",   [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS16",  [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS32",  [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS64",  [SiFiveP400VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTUX8",  [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX8",  [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
+      defvar LMulLat = SiFiveP400GetCyclesSegmented<mx, eew, nf>.c;
+      let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew,   [SiFiveP400VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew,  [SiFiveP400VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
+      }
+      let Latency = !add(1, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew,   [SiFiveP400VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew,  [SiFiveP400VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SiFiveP400VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SiFiveP400VST], mx, IsWorstCase>;
+      }
+    }
+  }
+}
+
+// Whole register move/load/store
+foreach LMul = [1, 2, 4, 8] in {
+  let Latency = 8, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP400VLD]>;
+    def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP400VST]>;
+  }
+  let Latency = 2, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVMov" #...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/102155