[llvm] [RISCV] Update some of the RVV memory ops in SiFive P400 & P600 sched models (PR #129575)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 14:30:21 PST 2025
================
@@ -368,65 +370,44 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
// 7. Vector Loads and Stores
-// FIXME: This unit is still being improved, currently
-// it is based on stage numbers. Estimates are optimistic,
-// latency may be longer.
-foreach mx = SchedMxList in {
- defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
- defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
- }
-}
+// Note that the latency of vector loads are measured by consuming the loaded
+// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
foreach mx = SchedMxList in {
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
+ let Latency = 8 in {
+ let ReleaseAtCycles = [LMulLat] in {
+ defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
+ }
+
+ // Mask load and store always have EMUL=1.
+ let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
+ defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
+ defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
+ }
}
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
+ foreach eew = [8, 16, 32, 64] in {
+ let Latency = 13, ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
----------------
topperc wrote:
If the throughput is related to the number of elements, shouldn't the latency also depend on the number of elements? A whole VLEN worth of elements needs to be loaded before a dependent operation can start. With larger elements there are fewer elements per VLEN, so I would expect larger elements to have lower latency than smaller elements.
https://github.com/llvm/llvm-project/pull/129575
More information about the llvm-commits
mailing list