[llvm] [AArch64] Neoverse V1 scheduling info (PR #126707)

Julien Villette via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 12 02:15:11 PST 2025


================
@@ -98,377 +103,487 @@ def V1Write_0c_0Z : SchedWriteRes<[]>;
 
 def V1Write_1c_1B      : SchedWriteRes<[V1UnitB]>   { let Latency = 1; }
 def V1Write_1c_1I      : SchedWriteRes<[V1UnitI]>   { let Latency = 1; }
-def V1Write_1c_1I_1Flg : SchedWriteRes<[V1UnitI, V1UnitFlg]>   { let Latency = 1; }
+def V1Write_1c_1I_1Flg : SchedWriteRes<[V1UnitI, V1UnitFlg]>   { let Latency = 1;
+								 let NumMicroOps = 2; }
 def V1Write_4c_1L      : SchedWriteRes<[V1UnitL]>   { let Latency = 4; }
+def V1Write_4c3_1L     : SchedWriteRes<[V1UnitL]>   { let Latency = 4;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_5c3_1L     : SchedWriteRes<[V1UnitL]>   { let Latency = 5;
+						      let ReleaseAtCycles = [3]; }
+
 def V1Write_6c_1L      : SchedWriteRes<[V1UnitL]>   { let Latency = 6; }
+def V1Write_6c2_1L     : SchedWriteRes<[V1UnitL]>   { let Latency = 6;
+						      let ReleaseAtCycles = [2]; }
+def V1Write_6c3_1L     : SchedWriteRes<[V1UnitL]>   { let Latency = 6;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_7c4_1L     : SchedWriteRes<[V1UnitL]>   { let Latency = 7;
+						      let ReleaseAtCycles = [4]; }
 def V1Write_1c_1L01    : SchedWriteRes<[V1UnitL01]> { let Latency = 1; }
 def V1Write_4c_1L01    : SchedWriteRes<[V1UnitL01]> { let Latency = 4; }
 def V1Write_6c_1L01    : SchedWriteRes<[V1UnitL01]> { let Latency = 6; }
 def V1Write_2c_1M      : SchedWriteRes<[V1UnitM]>   { let Latency = 2; }
-def V1Write_2c_1M_1Flg : SchedWriteRes<[V1UnitM, V1UnitFlg]>   { let Latency = 2; }
+def V1Write_2c_1M_1Flg : SchedWriteRes<[V1UnitM, V1UnitFlg]>   { let Latency = 2;
+								 let NumMicroOps = 2; }
 def V1Write_3c_1M      : SchedWriteRes<[V1UnitM]>   { let Latency = 3; }
-def V1Write_4c_1M      : SchedWriteRes<[V1UnitM]>   { let Latency = 4; }
+def V1Write_4c6_1M     : SchedWriteRes<[V1UnitM]>   { let Latency = 4;
+						      let ReleaseAtCycles = [6]; }
 def V1Write_1c_1M0     : SchedWriteRes<[V1UnitM0]>  { let Latency = 1; }
 def V1Write_2c_1M0     : SchedWriteRes<[V1UnitM0]>  { let Latency = 2; }
+def V1Write_2c2_1M0    : SchedWriteRes<[V1UnitM0]>  { let Latency = 2;
+						      let ReleaseAtCycles = [2]; }
+def V1Write_3c2_1M0    : SchedWriteRes<[V1UnitM0]>  { let Latency = 3;
+						      let ReleaseAtCycles = [2]; }
 def V1Write_3c_1M0     : SchedWriteRes<[V1UnitM0]>  { let Latency = 3; }
 def V1Write_5c_1M0     : SchedWriteRes<[V1UnitM0]>  { let Latency = 5; }
-def V1Write_12c5_1M0   : SchedWriteRes<[V1UnitM0]>  { let Latency = 12;
-                                                      let ReleaseAtCycles = [5]; }
-def V1Write_20c5_1M0   : SchedWriteRes<[V1UnitM0]>  { let Latency = 20;
-                                                      let ReleaseAtCycles = [5]; }
+def V1Write_12c12_1M0  : SchedWriteRes<[V1UnitM0]>  { let Latency = 12;
+						      let ReleaseAtCycles = [12]; }
+def V1Write_20c20_1M0  : SchedWriteRes<[V1UnitM0]>  { let Latency = 20;
+						      let ReleaseAtCycles = [20]; }
 def V1Write_2c_1V      : SchedWriteRes<[V1UnitV]>   { let Latency = 2; }
+def V1Write_2c4_1V     : SchedWriteRes<[V1UnitV]>   { let Latency = 2;
+						      let ReleaseAtCycles = [4]; }
 def V1Write_3c_1V      : SchedWriteRes<[V1UnitV]>   { let Latency = 3; }
 def V1Write_4c_1V      : SchedWriteRes<[V1UnitV]>   { let Latency = 4; }
+def V1Write_4c2_1V     : SchedWriteRes<[V1UnitV]>   { let Latency = 4;
+						      let ReleaseAtCycles = [2]; }
 def V1Write_5c_1V      : SchedWriteRes<[V1UnitV]>   { let Latency = 5; }
+def V1Write_6c3_1V     : SchedWriteRes<[V1UnitV]>   { let Latency = 6;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_12c4_1SVE1 : SchedWriteRes<[V1UnitSVE1]> { let Latency = 12;
+						       let NumMicroOps = 2;
+						       let ReleaseAtCycles = [4]; }
+def V1Write_14c4_1SVE1 : SchedWriteRes<[V1UnitSVE1]> { let Latency = 14;
+						       let NumMicroOps = 2;
+						       let ReleaseAtCycles = [4]; }
+
 def V1Write_2c_1V0     : SchedWriteRes<[V1UnitV0]>  { let Latency = 2; }
+def V1Write_2c_1SVE0   : SchedWriteRes<[V1UnitSVE0,V1UnitSVE0]>	 { let Latency = 2;
+								   let NumMicroOps = 2; }
 def V1Write_3c_1V0     : SchedWriteRes<[V1UnitV0]>  { let Latency = 3; }
+def V1Write_3c_1SVE0   : SchedWriteRes<[V1UnitSVE0,V1UnitSVE0]>	 { let Latency = 3;
+								   let NumMicroOps = 2; }
 def V1Write_4c_1V0     : SchedWriteRes<[V1UnitV0]>  { let Latency = 4; }
-def V1Write_6c_1V0     : SchedWriteRes<[V1UnitV0]>  { let Latency = 6; }
-def V1Write_10c7_1V0   : SchedWriteRes<[V1UnitV0]>  { let Latency = 10;
-                                                      let ReleaseAtCycles = [7]; }
-def V1Write_12c7_1V0   : SchedWriteRes<[V1UnitV0]>  { let Latency = 12;
-                                                      let ReleaseAtCycles = [7]; }
-def V1Write_13c10_1V0  : SchedWriteRes<[V1UnitV0]>  { let Latency = 13;
-                                                      let ReleaseAtCycles = [10]; }
-def V1Write_15c7_1V0   : SchedWriteRes<[V1UnitV0]>  { let Latency = 15;
-                                                      let ReleaseAtCycles = [7]; }
-def V1Write_16c7_1V0   : SchedWriteRes<[V1UnitV0]>  { let Latency = 16;
-                                                      let ReleaseAtCycles = [7]; }
-def V1Write_20c7_1V0   : SchedWriteRes<[V1UnitV0]>  { let Latency = 20;
-                                                      let ReleaseAtCycles = [7]; }
+def V1Write_4c_1SVE0   : SchedWriteRes<[V1UnitSVE0,V1UnitSVE0]>	 { let Latency = 4;
+								   let NumMicroOps = 2; }
+def V1Write_5c4_1SVE0  : SchedWriteRes<[V1UnitSVE0]>  { let Latency = 5;
+							let ReleaseAtCycles = [4];
+							let NumMicroOps = 2; }
+def V1Write_6c_1SVE0	: SchedWriteRes<[V1UnitSVE0,V1UnitSVE0]>  { let Latency = 6;
+								    let NumMicroOps = 2; }
+def V1Write_6c4_1SVE0	: SchedWriteRes<[V1UnitSVE0,V1UnitSVE0]>  { let Latency = 6;
+								    let NumMicroOps = 2;
+								    let ReleaseAtCycles = [4,4]; }
+def V1Write_10c18_1SVE0	: SchedWriteRes<[V1UnitSVE0]>  { let Latency = 10;
+							 let NumMicroOps = 2;
+							 let ReleaseAtCycles = [18]; }
+def V1Write_11c20_1SVE0	 : SchedWriteRes<[V1UnitSVE0]> { let Latency = 11;
+							 let NumMicroOps = 2;
+							 let ReleaseAtCycles = [20]; }
+def V1Write_12c22_1SVE0	: SchedWriteRes<[V1UnitSVE0]>  { let Latency = 12;
+							 let NumMicroOps = 2;
+							 let ReleaseAtCycles = [22]; }
+def V1Write_13c24_1SVE0 : SchedWriteRes<[V1UnitSVE0]>  { let Latency = 13;
+							 let NumMicroOps = 2;
+							 let ReleaseAtCycles = [24]; }
+def V1Write_15c28_1SVE0	: SchedWriteRes<[V1UnitSVE0]>  { let Latency = 15;
+							 let NumMicroOps = 2;
+							let ReleaseAtCycles = [28]; }
+def V1Write_16c28_1SVE0	: SchedWriteRes<[V1UnitSVE0]>  { let Latency = 16;
+							 let NumMicroOps = 2;
+							 let ReleaseAtCycles = [28]; }
+def V1Write_19c36_1SVE0 : SchedWriteRes<[V1UnitSVE0]> { let Latency = 19;
+							let NumMicroOps = 2;
+							let ReleaseAtCycles = [36]; }
+def V1Write_20c40_1SVE0	: SchedWriteRes<[V1UnitSVE0]> { let Latency = 20;
+							let NumMicroOps = 2;
+							let ReleaseAtCycles = [40]; }
+
 def V1Write_2c_1V01    : SchedWriteRes<[V1UnitV01]> { let Latency = 2; }
+def V1Write_2c_1SVE01  : SchedWriteRes<[V1UnitSVE01,V1UnitSVE01]> { let Latency = 2;
+								    let NumMicroOps = 2; }
 def V1Write_3c_1V01    : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
-def V1Write_4c_1V01    : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
-def V1Write_5c_1V01    : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Write_3c_1SVE01  : SchedWriteRes<[V1UnitSVE01,V1UnitSVE01]> { let Latency = 3;
+								    let NumMicroOps = 2; }
+def V1Write_4c_1SVE01  : SchedWriteRes<[V1UnitSVE01,V1UnitSVE01]> { let Latency = 4;
+								    let NumMicroOps = 2; }
+def V1Write_4c2_1V01   : SchedWriteRes<[V1UnitV01]> { let Latency = 4;
+						      let ReleaseAtCycles = [2]; }
+def V1Write_4c3_1V01   : SchedWriteRes<[V1UnitV01]> { let Latency = 4;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_6c3_1V01   : SchedWriteRes<[V1UnitV01]> { let Latency = 6;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_6c5_1V01   : SchedWriteRes<[V1UnitV01]> { let Latency = 6;
+						      let ReleaseAtCycles = [5]; }
+def V1Write_8c6_1SVE01 : SchedWriteRes<[V1UnitSVE01]> { let Latency = 8;
+							let NumMicroOps = 2;
+							let ReleaseAtCycles = [6]; }
+def V1Write_9c8_1SVE01 : SchedWriteRes<[V1UnitSVE01]> { let Latency = 9;
+							let NumMicroOps = 2;
+							let ReleaseAtCycles = [8]; }
+def V1Write_12c8_1SVE01: SchedWriteRes<[V1UnitSVE01]> { let Latency = 12;
+							let ReleaseAtCycles = [8];
+							let NumMicroOps = 2; }
+def V1Write_13c6_1SVE01	 : SchedWriteRes<[V1UnitSVE01]> { let Latency = 13;
+							  let ReleaseAtCycles = [12];
+							  let NumMicroOps = 2; }
+def V1Write_11c10_1SVE01  : SchedWriteRes<[V1UnitSVE01]> { let Latency = 11;
+							   let NumMicroOps = 2;
+							   let ReleaseAtCycles = [10]; }
 def V1Write_3c_1V02    : SchedWriteRes<[V1UnitV02]> { let Latency = 3; }
 def V1Write_4c_1V02    : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Write_4c2_1V02   : SchedWriteRes<[V1UnitV02]> { let Latency = 4;
+						      let ReleaseAtCycles = [2]; }
+def V1Write_6c4_1V02   : SchedWriteRes<[V1UnitV02]> { let Latency = 6;
+						      let ReleaseAtCycles = [4]; }
+def V1Write_7c2_1V02   : SchedWriteRes<[V1UnitV02]> { let Latency = 7;
+						      let ReleaseAtCycles = [2]; }
 def V1Write_7c7_1V02   : SchedWriteRes<[V1UnitV02]> { let Latency = 7;
                                                       let ReleaseAtCycles = [7]; }
-def V1Write_10c7_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 10;
-                                                      let ReleaseAtCycles = [7]; }
-def V1Write_13c5_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 13;
+def V1Write_9c3_1V02  : SchedWriteRes<[V1UnitV02]>  { let Latency = 9;
+						      let ReleaseAtCycles = [2]; }
+def V1Write_10c3_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 10;
+						      let ReleaseAtCycles = [3]; }
+def V1Write_10c5_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 10;
                                                       let ReleaseAtCycles = [5]; }
-def V1Write_13c11_1V02 : SchedWriteRes<[V1UnitV02]> { let Latency = 13;
-                                                      let ReleaseAtCycles = [11]; }
+def V1Write_10c9_1V02 : SchedWriteRes<[V1UnitV02]> { let Latency = 10;
+						      let ReleaseAtCycles = [9]; }
+def V1Write_13c13_1V02 : SchedWriteRes<[V1UnitV02]> { let Latency = 13;
+						      let ReleaseAtCycles = [13]; }
 def V1Write_15c7_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 15;
                                                       let ReleaseAtCycles = [7]; }
-def V1Write_16c7_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 16;
-                                                      let ReleaseAtCycles = [7]; }
+def V1Write_15c14_1V02 : SchedWriteRes<[V1UnitV02]> { let Latency = 15;
+						      let ReleaseAtCycles = [14]; }
+def V1Write_16c8_1V02  : SchedWriteRes<[V1UnitV02]> { let Latency = 16;
+						      let ReleaseAtCycles = [8]; }
----------------
jvillette38 wrote:

This one is used for FSQRTDr. Again, I used the worst-case throughput.
`FP square root, D-form | FSQRT | 7 to 16 | 4/15 to 4/7 | V02`
So the worst-case throughput is 4/15. This instruction can be issued on V0 or V2, so the throughput of each pipeline is (4/15)/2 = 2/15.
The number of cycles the micro-op should occupy a pipeline is therefore 15/2 = 7.5.
It was computed with a script to generate references. I agree that it would be better to also consider the best case + 1/3 of the difference between the best and worst cases. And probably, after benchmarking, only the best case...
I can skip this kind of change in new versions of the patch.
Sorry.

https://github.com/llvm/llvm-project/pull/126707


More information about the llvm-commits mailing list