[llvm] [RISCV] Update SpacemiT-X60 vector floating-point instructions (PR #150618)
Mikhail R. Gadelha via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 25 06:32:27 PDT 2025
================
@@ -518,94 +535,162 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
- }
-}
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
-foreach mx = SchedMxListF in {
- foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = !if(!eq(sew, 64), Get5558Latency<mx>.c, Get458Latency<mx>.c), ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
- defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ // TODO: for some reason, the following condition is not working and always uses Get5558Latency
+ let Latency = !if(!eq(sew, 64), Get6678Latency<mx>.c, Get5558Latency<mx>.c), ReleaseAtCycles = [5] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
+ }
- defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ let Latency = !mul(Get1248Latency<mx>.c, 4), ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
- defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ let Latency = !mul(Get1248Latency<mx>.c, 4), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ let Latency = !mul(Get1248Latency<mx>.c, 4), ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+
+ // Pattern for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: e16mf4=4, e16mf2=4, e16m1=4, e16m2=5,
+ // e16m4=8, e32mf2=4, e32m1=4, e32m2=5, e32m4=8
+ // Pattern for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: e16mf4=5, e16mf2=5, e16m1=5, e16m2=9,
+ // e16m4=17, e32mf2=5, e32m1=5, e32m2=9, e32m4=17
+ // TODO: Split .wf/.wv variants into separate scheduling classes to use 5/5/9/17
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ // Pattern for vfwmul.vv, vfwmul.vf: e16 = 4/4/4/6/8. We use 4/4/5/8 as approximation
+ // TODO: e32m4 = 8, but it's set to 5 here
+ let Latency = !if(!eq(sew, 32), Get5558Latency<mx>.c, Get4588Latency<mx>.c), ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ // Pattern for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
+ // Use existing 6,6,7,8 as close approximation
+ let Latency = Get6678Latency<mx>.c, ReleaseAtCycles = [6] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
// Narrowing
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ let Latency = !mul(Get1248Latency<mx>.c, 4), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
-
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+
+ let Latency = !mul(Get1248Latency<mx>.c, 4), ReleaseAtCycles = [4] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
// Vector Floating-Point Division and Square Root
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+ defvar LMulLat = Get1248Latency<mx>.c;
+
+ // Pattern for vfdiv.vf: e16 = 12/24/48/96; e32 = 12/24/48/96; e64 = 18/36/72/144
+ // Pattern for vfrdiv.vf: e16 = 12/24/48/96; e32 = 12/24/48/96; e64 = 40/80/160/320
+ // We use the worst-case, vfdiv.vf is penalized in e64
+ // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
+ let Latency = !if(!eq(sew, 64), !mul(LMulLat, 40), !mul(LMulLat, 12)), ReleaseAtCycles = [7] in {
----------------
mikhailramalho wrote:
I used ReleaseAtCycles = [7] because it is the smallest latency I measured (at fractional LMUL).
The M1 latency starts at 12 and is higher for larger SEW. Should I scale ReleaseAtCycles with SEW as well?
https://github.com/llvm/llvm-project/pull/150618
More information about the llvm-commits mailing list