[llvm] [WIP][RISCV] tt-ascalon-d8 vector scheduling (PR #167066)
Petr Penzin via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 16:54:57 PST 2025
https://github.com/ppenzin created https://github.com/llvm/llvm-project/pull/167066
Partial scheduling model for vector operations.
Drive-by: additional tuning knobs (the optimized segment load/store tune features).
From 162020b748089263ed310b99cfbded59e4d9ae79 Mon Sep 17 00:00:00 2001
From: Petr Penzin <ppenzin at tenstorrent.com>
Date: Fri, 7 Nov 2025 18:51:19 -0600
Subject: [PATCH] [WIP][RISCV] tt-ascalon-d8 vector scheduling
Partial scheduling model for vector operations.
Drive-by: additional tuning knobs (the optimized segment load/store tune features).
---
llvm/lib/Target/RISCV/RISCVProcessors.td | 7 +
.../lib/Target/RISCV/RISCVSchedTTAscalonD8.td | 318 +++++++++++++++++-
2 files changed, 319 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index e86431f78f1ba..07f6a38c77897 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -633,6 +633,13 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
FeatureUnalignedVectorMem]),
[TuneNoDefaultUnroll,
TuneNLogNVRGather,
+ TuneOptimizedNF2SegmentLoadStore,
+ TuneOptimizedNF3SegmentLoadStore,
+ TuneOptimizedNF4SegmentLoadStore,
+ TuneOptimizedNF5SegmentLoadStore,
+ TuneOptimizedNF6SegmentLoadStore,
+ TuneOptimizedNF7SegmentLoadStore,
+ TuneOptimizedNF8SegmentLoadStore,
TuneOptimizedZeroStrideLoad,
TunePostRAScheduler]>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
index da89e158f9839..973e55b5c53f8 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
@@ -8,19 +8,85 @@
//===----------------------------------------------------------------------===//
+class AscalonIsWorstCaseMX<string mx, list<string> MxList> {
+ defvar LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
+}
+
+class AscalonIsWorstCaseMXSEW<string mx, int sew, list<string> MxList,
+ bit isF = 0> {
+ defvar LLMUL = LargestLMUL<MxList>.r;
+ defvar SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
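+/// e.g. AscalonIsWorstCaseMX<"M8", SchedMxList>.c = 1, while any smaller LMUL
+/// in the list yields 0; the MXSEW variant additionally requires sew to be the
+/// smallest SEW legal for that LMUL.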
+
+/// Cycle counts that scale linearly with LMUL; fractional LMULs use the same
+/// base cycle count as LMUL=1.
+class AscalonGetCyclesLMUL<string mx, int base> {
+ int c = !cond(
+ !eq(mx, "M1") : base,
+ !eq(mx, "M2") : !mul(base, 2),
+ !eq(mx, "M4") : !mul(base, 4),
+ !eq(mx, "M8") : !mul(base, 8),
+ !eq(mx, "MF2") : base,
+ !eq(mx, "MF4") : base,
+ !eq(mx, "MF8") : base
+ );
+}
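+/// e.g. AscalonGetCyclesLMUL<"M4", 2>.c = 8, while "MF2" (or any other
+/// fractional LMUL) stays at the base count of 2.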
+
+/// Linear LMUL scaling starting from smallest fractional LMUL
+class AscalonGetCyclesLMULFractional<string mx, int base> {
+ int c = !cond(
+ !eq(mx, "MF8") : base,
+ !eq(mx, "MF4") : !mul(base, 2),
+ !eq(mx, "MF2") : !mul(base, 4),
+ !eq(mx, "M1") : !mul(base, 8),
+ !eq(mx, "M2") : !mul(base, 16),
+ !eq(mx, "M4") : !mul(base, 32),
+ !eq(mx, "M8") : !mul(base, 64)
+ );
+}
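+/// e.g. with base = 1: MF8 -> 1, M1 -> 8, M8 -> 64 (doubling per LMUL step).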
+
+class AscalonGetCyclesDefault<string mx> {
+ int c = AscalonGetCyclesLMUL<mx, 1>.c;
+}
+
+class AscalonGetCyclesNarrowing<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : 4,
+ !eq(mx, "M2") : 8,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "MF2") : 2,
+ !eq(mx, "MF4") : 1,
+ !eq(mx, "MF8") : 1
+ );
+}
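+/// e.g. 4 cycles at M1, doubling per LMUL step up to 16 at M4, halving below
+/// M1 with a floor of 1 cycle (SchedMxListW has no M8 entry).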
+
+class AscalonGetCyclesDivOrSqrt<string mx, int sew> {
+ int c = !cond(
+    !eq(sew, 8) : AscalonGetCyclesLMUL<mx, 7>.c, // TODO: SEW=8 is not valid for FP
+ !eq(sew, 16) : AscalonGetCyclesLMUL<mx, 6>.c,
+ !eq(sew, 32) : AscalonGetCyclesLMUL<mx, 5>.c,
+ !eq(sew, 64) : AscalonGetCyclesLMUL<mx, 8>.c
+ );
+}
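+/// e.g. SEW=32 at M2 -> 5 * 2 = 10 cycles; SEW=64 at M8 -> 8 * 8 = 64 cycles.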
+
+//===----------------------------------------------------------------------===//
+
def TTAscalonD8Model : SchedMachineModel {
let IssueWidth = 8; // 8-way decode and dispatch
let MicroOpBufferSize = 256; // 256 micro-op re-order buffer
let LoadLatency = 4; // Optimistic load latency
let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
- let CompleteModel = 0;
+ let CompleteModel = false;
// TODO: supported, but haven't added scheduling info yet.
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
- HasStdExtZkr, HasVInstructions, HasVInstructionsI64];
+ HasStdExtZkr];
}
let SchedModel = TTAscalonD8Model in {
@@ -34,11 +100,17 @@ let BufferSize = 16 in {
def AscalonFXB : ProcResource<1>; // ALU, INT -> FP/VEC
def AscalonFXC : ProcResource<2>; // ALU, BR
def AscalonFXD : ProcResource<2>; // ALU
- def AscalonFP : ProcResource<2>;
- // TODO: two vector units with vector scheduling model.
+ def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>;
+ // FP
+  def AscalonFPA : ProcResource<1>; // Pipe A also handles FP/VEC -> INT
+ def AscalonFPB : ProcResource<1>;
+ def AscalonFP : ProcResGroup<[AscalonFPA, AscalonFPB]>;
+ // Vector
+ def AscalonVA : ProcResource<1>;
+ def AscalonVB : ProcResource<1>;
+  def AscalonV : ProcResGroup<[AscalonVA, AscalonVB]>;
}
-def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>;
//===----------------------------------------------------------------------===//
@@ -316,10 +388,244 @@ def : ReadAdvance<ReadSHXADD32, 0>;
def : ReadAdvance<ReadSingleBit, 0>;
def : ReadAdvance<ReadSingleBitImm, 0>;
+//===----------------------------------------------------------------------===//
+// Vector
+
+// Configuration-Setting Instructions
+let Latency = 1 in {
+def : WriteRes<WriteVSETVLI, [AscalonV]>;
+def : WriteRes<WriteVSETIVLI, [AscalonV]>;
+}
+let Latency = 2 in {
+def : WriteRes<WriteVSETVL, [AscalonV]>;
+}
+
+// Vector Integer Arithmetic Instructions
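+// Note: for the per-LMUL entries below, AcquireAtCycles = [0, 1] with
+// ReleaseAtCycles = [1, Cycles] reserves the first resource (the issue pipe)
+// for the interval [0, 1) and the vector unit for [1, Cycles), so back-to-back
+// ops contend on the vector unit for Cycles - 1 cycles.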
+foreach mx = SchedMxList in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVIALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
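+// For reference, LMULWriteResMX (RISCVScheduleV.td) expands each entry to a
+// WriteRes for the per-LMUL SchedWrite (e.g. WriteVIALUV_M4) and, when
+// IsWorstCase is set, also for the unsuffixed SchedWrite (e.g. WriteVIALUV).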
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [AscalonFX, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
+// Narrowing
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
+
+// Vector Floating-Point Instructions
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>;
+ }
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVFClassV", [AscalonFP, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [AscalonFP, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [AscalonFP, AscalonV], mx, IsWorstCase>;
+ }
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [AscalonFP, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [AscalonFP, AscalonV], mx, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListFW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>;
+}
+// Narrowing
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Vector Reduction Instructions
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+foreach mx = SchedMxListWRed in {
+ foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar RedCycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+ let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, RedCycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+ defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c;
+ let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, OrdRedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
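+// Ordered FP reductions scale at 18 cycles per LMUL (18 at M1, 144 at M8),
+// versus the 1-cycle-per-LMUL default used for the unordered forms above.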
+
+foreach mx = SchedMxListFWRed in {
+ foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+ defvar RedCycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+ let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, RedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c;
+ let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, OrdRedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbc;
defm : UnsupportedSchedZbkb;