[llvm] r342728 - [X86][BtVer2] Fix latency and resource cycles of AVX 256-bit zero-idioms.
Andrea Di Biagio via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 21 05:43:07 PDT 2018
Author: adibiagio
Date: Fri Sep 21 05:43:07 2018
New Revision: 342728
URL: http://llvm.org/viewvc/llvm-project?rev=342728&view=rev
Log:
[X86][BtVer2] Fix latency and resource cycles of AVX 256-bit zero-idioms.
This patch introduces a SchedWriteVariant to describe zero-idiom VXORP(S|D)Yrr
and VANDNP(S|D)Yrr.
This is a follow-up of r342555.
On Jaguar, a VXORPSYrr is 2 macro opcodes. Only one opcode is eliminated at the
register-renaming stage. The other opcode has to be executed to set the upper
half of the destination YMM register.
The same applies to VXORPDYrr and VANDNP(S|D)Yrr.
Differential Revision: https://reviews.llvm.org/D52347
Modified:
llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
llvm/trunk/test/CodeGen/X86/avx-schedule.ll
llvm/trunk/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s
Modified: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td?rev=342728&r1=342727&r2=342728&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td Fri Sep 21 05:43:07 2018
@@ -595,6 +595,10 @@ def JWriteZeroLatency : SchedWriteRes<[]
let Latency = 0;
}
+def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
+ let NumMicroOps = 2;
+}
+
// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
@@ -619,6 +623,13 @@ def : InstRW<[JWriteFZeroIdiom], (instrs
ANDNPSrr, VANDNPSrr,
ANDNPDrr, VANDNPDrr)>;
+def JWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr)>;
+
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
SchedVar<NoSchedPred, [WriteVecLogic]>
Modified: llvm/trunk/test/CodeGen/X86/avx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-schedule.ll?rev=342728&r1=342727&r2=342728&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll Fri Sep 21 05:43:07 2018
@@ -5489,10 +5489,10 @@ define void @test_avx256_zero_idioms() {
; BTVER2-LABEL: test_avx256_zero_idioms:
; BTVER2: # %bb.0:
; BTVER2-NEXT: #APP
-; BTVER2-NEXT: vxorps %ymm0, %ymm0, %ymm0 # sched: [1:1.00]
-; BTVER2-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # sched: [1:1.00]
-; BTVER2-NEXT: vandnps %ymm2, %ymm2, %ymm2 # sched: [1:1.00]
-; BTVER2-NEXT: vandnpd %ymm3, %ymm3, %ymm3 # sched: [1:1.00]
+; BTVER2-NEXT: vxorps %ymm0, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; BTVER2-NEXT: vandnps %ymm2, %ymm2, %ymm2 # sched: [1:0.50]
+; BTVER2-NEXT: vandnpd %ymm3, %ymm3, %ymm3 # sched: [1:0.50]
; BTVER2-NEXT: #NO_APP
; BTVER2-NEXT: retq # sched: [4:1.00]
;
Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s?rev=342728&r1=342727&r2=342728&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s Fri Sep 21 05:43:07 2018
@@ -35,12 +35,12 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 300
-# CHECK-NEXT: Total Cycles: 306
+# CHECK-NEXT: Total Cycles: 304
# CHECK-NEXT: Total uOps: 600
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 1.96
-# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: uOps Per Cycle: 1.97
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@@ -53,7 +53,7 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 3 2.00 vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 2 1 1.00 vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2 1 0.50 vxorps %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2 1 1.00 vblendps $2, %ymm1, %ymm2, %ymm3
# CHECK: Resources:
@@ -74,27 +74,27 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 3.00 3.00 3.00 3.00 - - - - - - -
+# CHECK-NEXT: - - - 3.00 2.00 3.00 2.00 - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vxorps %ymm1, %ymm1, %ymm1
# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - vblendps $2, %ymm1, %ymm2, %ymm3
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeER . . vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1] .DeE-R . . vxorps %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,2] . DeE-R . . vblendps $2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [1,0] . D=eeeER. . vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1] . DeE--R. . vxorps %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [1,2] . D=eE-R . vblendps $2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [2,0] . .DeeeER. vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1] . . D=eER. vxorps %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [2,2] . . D=eER vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK: [0,0] DeeeER . . vaddps %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1] .DeE-R . . vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2] . DeE-R . . vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0] . DeeeER . . vaddps %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1] . DeE-R . . vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2] . DeE-R. . vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0] . .D=eeeER. vaddps %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1] . . DeE--R. vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2] . . DeE--R vblendps $2, %ymm1, %ymm2, %ymm3
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -104,19 +104,19 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddps %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1. 3 1.3 1.3 1.0 vxorps %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 2. 3 1.7 0.3 0.7 vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1. 3 1.0 1.0 1.3 vxorps %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2. 3 1.0 0.0 1.3 vblendps $2, %ymm1, %ymm2, %ymm3
# CHECK: [1] Code Region - ZERO-IDIOM-2
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 300
-# CHECK-NEXT: Total Cycles: 306
+# CHECK-NEXT: Total Cycles: 304
# CHECK-NEXT: Total uOps: 600
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 1.96
-# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: uOps Per Cycle: 1.97
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@@ -129,7 +129,7 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 3 2.00 vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 2 1 1.00 vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2 1 0.50 vxorpd %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2 1 1.00 vblendpd $2, %ymm1, %ymm2, %ymm3
# CHECK: Resources:
@@ -150,27 +150,27 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 3.00 3.00 3.00 3.00 - - - - - - -
+# CHECK-NEXT: - - - 3.00 2.00 3.00 2.00 - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vxorpd %ymm1, %ymm1, %ymm1
# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - vblendpd $2, %ymm1, %ymm2, %ymm3
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeER . . vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1] .DeE-R . . vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,2] . DeE-R . . vblendpd $2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [1,0] . D=eeeER. . vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1] . DeE--R. . vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [1,2] . D=eE-R . vblendpd $2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [2,0] . .DeeeER. vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1] . . D=eER. vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [2,2] . . D=eER vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK: [0,0] DeeeER . . vaddpd %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1] .DeE-R . . vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2] . DeE-R . . vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0] . DeeeER . . vaddpd %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1] . DeE-R . . vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2] . DeE-R. . vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0] . .D=eeeER. vaddpd %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1] . . DeE--R. vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2] . . DeE--R vblendpd $2, %ymm1, %ymm2, %ymm3
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -180,8 +180,8 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddpd %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1. 3 1.3 1.3 1.0 vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 2. 3 1.7 0.3 0.7 vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1. 3 1.0 1.0 1.3 vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2. 3 1.0 0.0 1.3 vblendpd $2, %ymm1, %ymm2, %ymm3
# CHECK: [2] Code Region - ZERO-IDIOM-3
@@ -205,7 +205,7 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 3 2.00 vaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 1 1.00 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 2 1 0.50 vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
@@ -225,12 +225,12 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 - - - - - - -
+# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vandnps %ymm2, %ymm2, %ymm3
# CHECK: Timeline view:
# CHECK-NEXT: Index 0123456789
@@ -274,7 +274,7 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 3 2.00 vaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 1 1.00 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 2 1 0.50 vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
@@ -294,12 +294,12 @@ vandnps %ymm2, %ymm2, %ymm3
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 - - - - - - -
+# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vandnps %ymm2, %ymm2, %ymm3
# CHECK: Timeline view:
# CHECK-NEXT: Index 0123456789
More information about the llvm-commits mailing list.