[llvm] [RISCV] Update some of the RVV memory ops in SiFive P400 & P600 sched models (PR #129575)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 11:21:38 PDT 2025
================
@@ -368,65 +370,44 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
// 7. Vector Loads and Stores
-// FIXME: This unit is still being improved, currently
-// it is based on stage numbers. Estimates are optimistic,
-// latency may be longer.
-foreach mx = SchedMxList in {
- defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
- defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
- }
-}
+// Note that the latency of vector loads are measured by consuming the loaded
+// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
foreach mx = SchedMxList in {
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
- }
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
+ let Latency = 8 in {
+ let ReleaseAtCycles = [LMulLat] in {
+ defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
+ }
+
+ // Mask load and store always have EMUL=1.
+ let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
+ defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
+ defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
+ }
}
- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
+ foreach eew = [8, 16, 32, 64] in {
+ let Latency = 13, ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
----------------
mshockwave wrote:
I'd re-measured and updated the latencies. The latencies of stride and indexed ops are indeed dependent on VL.
https://github.com/llvm/llvm-project/pull/129575
More information about the llvm-commits
mailing list