[llvm] r328914 - [X86] Add SchedRW for PMULLD
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 30 21:54:32 PDT 2018
Author: ctopper
Date: Fri Mar 30 21:54:32 2018
New Revision: 328914
URL: http://llvm.org/viewvc/llvm-project?rev=328914&view=rev
Log:
[X86] Add SchedRW for PMULLD
Summary:
It seems many CPUs don't implement this instruction as well as the other vector multiplies, often using a multi-uop flow. Silvermont in particular has a 7-uop flow with 11-cycle throughput. Sandy Bridge implements it as a single uop with 5-cycle latency and 1-cycle throughput, but Haswell and later use 2 uops with 10-cycle latency and 2-cycle throughput.
This patch adds a new X86SchedWritePair we can use to tag this instruction separately. I've provided correct information for Silvermont, Btver2, and Sandy Bridge. I've removed the InstRWs for SandyBridge. I've left Haswell/Broadwell/Skylake InstRWs in place because I wasn't sure how to account for the different load latency between 128 and 256 bits. I also left Znver1 InstRWs in place because the existing values don't match Agner's spreadsheet.
I also left a TODO comment in the SandyBridge model because using that model as the "generic" model is too optimistic for the 256/512-bit versions, since those are multiple uops on all known CPUs.
Reviewers: RKSimon, GGanesh, courbet
Reviewed By: RKSimon
Subscribers: gchatelet, gbedwell, andreadb, llvm-commits
Differential Revision: https://reviews.llvm.org/D44972
Modified:
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
llvm/trunk/lib/Target/X86/X86SchedHaswell.td
llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
llvm/trunk/lib/Target/X86/X86Schedule.td
llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
llvm/trunk/lib/Target/X86/X86ScheduleSLM.td
llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
llvm/trunk/test/CodeGen/X86/slow-pmulld.ll
llvm/trunk/test/CodeGen/X86/sse41-schedule.ll
llvm/trunk/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s
llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s
llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse41.s
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Fri Mar 30 21:54:32 2018
@@ -4505,7 +4505,7 @@ defm VPADDUS : avx512_binop_rm_vl_bw<0xD
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
SSE_INTALU_ITINS_P, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
- SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD;
+ SSE_PMULLD_ITINS, HasAVX512, 1>, T8PD;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
SSE_INTMUL_ITINS_P, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Fri Mar 30 21:54:32 2018
@@ -195,7 +195,7 @@ def SSE_MPSADBW_ITINS : OpndItins<
IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;
-let Sched = WriteVecIMul in
+let Sched = WritePMULLD in
def SSE_PMULLD_ITINS : OpndItins<
IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
Modified: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedBroadwell.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td Fri Mar 30 21:54:32 2018
@@ -167,6 +167,7 @@ def : WriteRes<WriteVecMove, [BW
defm : BWWriteResPair<WriteVecALU, [BWPort15], 1>; // Vector integer ALU op, no logicals.
defm : BWWriteResPair<WriteVecShift, [BWPort0], 1>; // Vector integer shifts.
defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5>; // Vector integer multiply.
+defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // PMULLD
defm : BWWriteResPair<WriteShuffle, [BWPort5], 1>; // Vector shuffles.
defm : BWWriteResPair<WriteBlend, [BWPort15], 1>; // Vector blends.
defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2]>; // Vector variable blends.
@@ -2180,13 +2181,6 @@ def BWWriteResGroup113 : SchedWriteRes<[
def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm",
"LSL(16|32|64)rm")>;
-def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup114], (instregex "(V?)PMULLD(Y?)rr")>;
-
def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
@@ -2462,13 +2456,6 @@ def: InstRW<[BWWriteResGroup147], (instr
"DIVR_FST0r",
"DIVR_FrST0")>;
-def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup148], (instregex "(V?)PMULLDrm")>;
-
def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Fri Mar 30 21:54:32 2018
@@ -163,6 +163,7 @@ defm : HWWriteResPair<WriteVecShift, [HW
defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1>;
defm : HWWriteResPair<WriteVecALU, [HWPort15], 1>;
defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5>;
+defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>;
defm : HWWriteResPair<WriteShuffle, [HWPort5], 1>;
defm : HWWriteResPair<WriteBlend, [HWPort15], 1>;
defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3>;
@@ -2680,20 +2681,6 @@ def HWWriteResGroup117 : SchedWriteRes<[
}
def: InstRW<[HWWriteResGroup117], (instregex "(V?)DPPDrmi")>;
-def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup118], (instregex "(V?)PMULLD(Y?)rr")>;
-
-def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119], (instregex "(V?)PMULLDrm")>;
-
def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 17;
let NumMicroOps = 3;
Modified: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td Fri Mar 30 21:54:32 2018
@@ -151,6 +151,7 @@ defm : SBWriteResPair<WriteVecShift, [SB
defm : SBWriteResPair<WriteVecLogic, [SBPort5], 1>;
defm : SBWriteResPair<WriteVecALU, [SBPort1], 3>;
defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5>;
+defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>; // TODO this is probably wrong for 256/512-bit for the "generic" model
defm : SBWriteResPair<WriteShuffle, [SBPort5], 1>;
defm : SBWriteResPair<WriteBlend, [SBPort15], 1>;
defm : SBWriteResPair<WriteVarBlend, [SBPort1, SBPort5], 2>;
@@ -672,7 +673,6 @@ def: InstRW<[SBWriteResGroup20], (instre
"(V?)PMULHRSWrr",
"(V?)PMULHUWrr",
"(V?)PMULHWrr",
- "(V?)PMULLDrr",
"(V?)PMULLWrr",
"(V?)PMULUDQrr",
"(V?)PSADBWrr")>;
@@ -1602,7 +1602,6 @@ def: InstRW<[SBWriteResGroup89], (instre
"(V?)PMULHRSWrm",
"(V?)PMULHUWrm",
"(V?)PMULHWrm",
- "(V?)PMULLDrm",
"(V?)PMULLWrm",
"(V?)PMULUDQrm",
"(V?)PSADBWrm")>;
Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td Fri Mar 30 21:54:32 2018
@@ -164,6 +164,7 @@ def : WriteRes<WriteVecMove, [S
defm : SKLWriteResPair<WriteVecALU, [SKLPort15], 1>; // Vector integer ALU op, no logicals.
defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1>; // Vector integer shifts.
defm : SKLWriteResPair<WriteVecIMul, [SKLPort0], 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>;
defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1>; // Vector shuffles.
defm : SKLWriteResPair<WriteBlend, [SKLPort15], 1>; // Vector blends.
defm : SKLWriteResPair<WriteVarBlend, [SKLPort5], 2, [2]>; // Vector variable blends.
@@ -1849,13 +1850,6 @@ def: InstRW<[SKLWriteResGroup105], (inst
"(V?)ROUNDSDr",
"(V?)ROUNDSSr")>;
-def SKLWriteResGroup105_2 : SchedWriteRes<[SKLPort01]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup105_2], (instregex "(V?)PMULLD(Y?)rr")>;
-
def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
@@ -2559,13 +2553,6 @@ def: InstRW<[SKLWriteResGroup168], (inst
def: InstRW<[SKLWriteResGroup168], (instregex "(V?)ROUNDSDm")>;
def: InstRW<[SKLWriteResGroup168], (instregex "(V?)ROUNDSSm")>;
-def SKLWriteResGroup168_2 : SchedWriteRes<[SKLPort23,SKLPort01]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup168_2], (instregex "(V?)PMULLDrm")>;
-
def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 14;
let NumMicroOps = 3;
Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td Fri Mar 30 21:54:32 2018
@@ -164,6 +164,7 @@ def : WriteRes<WriteVecMove, [S
defm : SKXWriteResPair<WriteVecALU, [SKXPort15], 1>; // Vector integer ALU op, no logicals.
defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1>; // Vector integer shifts.
defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WritePMULLD, [SKXPort015], 10, [2], 2, 6>; // Vector integer multiply.
defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1>; // Vector shuffles.
defm : SKXWriteResPair<WriteBlend, [SKXPort15], 1>; // Vector blends.
defm : SKXWriteResPair<WriteVarBlend, [SKXPort5], 2, [2]>; // Vector variable blends.
Modified: llvm/trunk/lib/Target/X86/X86Schedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Schedule.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Schedule.td (original)
+++ llvm/trunk/lib/Target/X86/X86Schedule.td Fri Mar 30 21:54:32 2018
@@ -100,6 +100,7 @@ def WriteVecMove : SchedWrite;
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WritePMULLD : X86SchedWritePair; // PMULLD
defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
defm WriteBlend : X86SchedWritePair; // Vector blends.
defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
Modified: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td Fri Mar 30 21:54:32 2018
@@ -345,6 +345,7 @@ def : WriteRes<WriteVecMove,
defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>;
defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
Modified: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleSLM.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td Fri Mar 30 21:54:32 2018
@@ -138,6 +138,7 @@ defm : SLMWriteResPair<WriteVecShift, [S
defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
Modified: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td Fri Mar 30 21:54:32 2018
@@ -213,6 +213,7 @@ defm : ZnWriteResFpuPair<WriteVecLogic,
defm : ZnWriteResFpuPair<WritePHAdd, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4>; // FIXME
defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
Modified: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-schedule.ll?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll Fri Mar 30 21:54:32 2018
@@ -4911,7 +4911,7 @@ define <8 x i32> @test_pmulld(<8 x i32>
; GENERIC-LABEL: test_pmulld:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulld:
Modified: llvm/trunk/test/CodeGen/X86/slow-pmulld.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/slow-pmulld.ll?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/slow-pmulld.ll (original)
+++ llvm/trunk/test/CodeGen/X86/slow-pmulld.ll Fri Mar 30 21:54:32 2018
@@ -1215,34 +1215,32 @@ define <8 x i32> @test_mul_v8i32_v8i16_m
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
-; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM32-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; SLM32-NEXT: pmulld %xmm1, %xmm4
-; SLM32-NEXT: pmulld %xmm1, %xmm0
-; SLM32-NEXT: pmulld %xmm1, %xmm2
-; SLM32-NEXT: pmulld %xmm1, %xmm3
-; SLM32-NEXT: movdqa %xmm4, %xmm1
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SLM32-NEXT: pmulld %xmm5, %xmm0
+; SLM32-NEXT: pmulld %xmm5, %xmm2
+; SLM32-NEXT: pmulld %xmm5, %xmm1
+; SLM32-NEXT: pmulld %xmm5, %xmm3
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
-; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; SLM64-NEXT: pmulld %xmm1, %xmm4
-; SLM64-NEXT: pmulld %xmm1, %xmm0
-; SLM64-NEXT: pmulld %xmm1, %xmm2
-; SLM64-NEXT: pmulld %xmm1, %xmm3
-; SLM64-NEXT: movdqa %xmm4, %xmm1
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SLM64-NEXT: pmulld %xmm5, %xmm0
+; SLM64-NEXT: pmulld %xmm5, %xmm2
+; SLM64-NEXT: pmulld %xmm5, %xmm1
+; SLM64-NEXT: pmulld %xmm5, %xmm3
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
Modified: llvm/trunk/test/CodeGen/X86/sse41-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41-schedule.ll?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41-schedule.ll Fri Mar 30 21:54:32 2018
@@ -4817,8 +4817,8 @@ define <4 x i32> @test_pmulld(<4 x i32>
;
; SLM-LABEL: test_pmulld:
; SLM: # %bb.0:
-; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00]
-; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [11:11.00]
+; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [14:11.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-SSE-LABEL: test_pmulld:
@@ -4883,14 +4883,14 @@ define <4 x i32> @test_pmulld(<4 x i32>
;
; BTVER2-SSE-LABEL: test_pmulld:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00]
+; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [4:2.00]
+; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [9:2.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_pmulld:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_pmulld:
Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s Fri Mar 30 21:54:32 2018
@@ -19,7 +19,7 @@ vsqrtps %ymm0, %ymm2
# CHECK: Iterations: 70
# CHECK-NEXT: Instructions: 560
-# CHECK-NEXT: Total Cycles: 4415
+# CHECK-NEXT: Total Cycles: 4416
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 0.13
@@ -33,7 +33,7 @@ vsqrtps %ymm0, %ymm2
# CHECK-NEXT: [6]: HasSideEffects
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 2 1.00 vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 3 4 2.00 vpmulld %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.50 vpand %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 3 1.00 vcvttps2dq %xmm0, %xmm2
# CHECK-NEXT: 1 2 1.00 vpclmulqdq $0, %xmm0, %xmm1, %xmm2
@@ -62,12 +62,11 @@ vsqrtps %ymm0, %ymm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 3.00 63.00 5.00 5.00 - - - 1.00 0.50 0.50 2.00
-
+# CHECK-NEXT: - - - 3.00 63.00 6.01 5.99 - - - 1.00 1.00 1.00 3.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
-# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulld %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - 1.00 - - - - 0.50 0.50 - vpand %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - 2.00 1.00 - - - - 0.03 0.97 2.00 vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - 0.01 0.99 - - - - 0.97 0.03 - vpand %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - 1.00 - - - 1.00 - - - vcvttps2dq %xmm0, %xmm2
# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpclmulqdq $0, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2
@@ -80,20 +79,20 @@ vsqrtps %ymm0, %ymm2
# CHECK-NEXT: 0123456789 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 01234567
-# CHECK: [0,0] DeeER. . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,2] .DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm2
-# CHECK-NEXT: [0,3] .DeeE-R . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,4] . DeeeER . . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm2
-# CHECK-NEXT: [0,6] . DeeeE-----------------R . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [0,7] . D===================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER vsqrtps %ymm0, %ymm2
-
-# CHECK: [1,0] . DeeE----------------------------------------------------------R vpmulld %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,1] . DeE-----------------------------------------------------------R vpand %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,2] . .DeeeE--------------------------------------------------------R vcvttps2dq %xmm0, %xmm2
-# CHECK-NEXT: [1,3] . .DeeE---------------------------------------------------------R vpclmulqdq $0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,4] . . DeeeE-------------------------------------------------------R vaddps %xmm0, %xmm1, %xmm2
+# CHECK: [0,0] DeeeeER . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1] .DeE--R . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,2] . DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm2
+# CHECK-NEXT: [0,3] . DeeE-R . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,4] . DeeeER . . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm2
+# CHECK-NEXT: [0,6] . DeeeE-----------------R . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,7] . D===================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER vsqrtps %ymm0, %ymm2
+
+# CHECK: [1,0] . .DeeeeE--------------------------------------------------------R vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1] . . DeE----------------------------------------------------------R vpand %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,2] . . DeeeE-------------------------------------------------------R vcvttps2dq %xmm0, %xmm2
+# CHECK-NEXT: [1,3] . . DeeE--------------------------------------------------------R vpclmulqdq $0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,4] . . DeeeE------------------------------------------------------R vaddps %xmm0, %xmm1, %xmm2
# CHECK: Average Wait times (based on the timeline view):
@@ -103,11 +102,11 @@ vsqrtps %ymm0, %ymm2
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 1.0 1.0 29.0 vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0. 2 1.0 1.0 28.0 vpmulld %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1. 2 1.0 1.0 30.0 vpand %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 2. 2 1.0 1.0 28.0 vcvttps2dq %xmm0, %xmm2
-# CHECK-NEXT: 3. 2 1.0 1.0 29.0 vpclmulqdq $0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 4. 2 1.0 1.0 27.5 vaddps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2. 2 1.0 1.0 27.5 vcvttps2dq %xmm0, %xmm2
+# CHECK-NEXT: 3. 2 1.0 1.0 28.5 vpclmulqdq $0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 4. 2 1.0 1.0 27.0 vaddps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 5. 1 1.0 1.0 0.0 vsqrtps %xmm0, %xmm2
# CHECK-NEXT: 6. 1 1.0 1.0 17.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 7. 1 20.0 20.0 0.0 vsqrtps %ymm0, %ymm2
Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s Fri Mar 30 21:54:32 2018
@@ -1518,8 +1518,8 @@ vzeroupper
# CHECK-NEXT: 1 7 1.00 * vpmulhuw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 2 1.00 vpmulhw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 7 1.00 * vpmulhw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 2 1.00 vpmulld %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 7 1.00 * vpmulld (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 3 4 2.00 vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 3 9 2.00 * vpmulld (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 2 1.00 vpmullw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 7 1.00 * vpmullw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 2 1.00 vpmuludq %xmm0, %xmm1, %xmm2
@@ -2221,8 +2221,8 @@ vzeroupper
# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulhuw (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulhw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulhw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulld %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulld (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - - - - 2.50 0.50 - - - - 0.50 0.50 2.00 vpmulld %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - 2.50 0.50 1.00 - - - 0.50 0.50 2.00 vpmulld (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmullw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmullw (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmuludq %xmm0, %xmm1, %xmm2
Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse41.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse41.s?rev=328914&r1=328913&r2=328914&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse41.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse41.s Fri Mar 30 21:54:32 2018
@@ -247,8 +247,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - pmovzxwq (%rax), %xmm2
# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 pmuldq %xmm0, %xmm2
# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 pmuldq (%rax), %xmm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 pmulld %xmm0, %xmm2
-# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 pmulld (%rax), %xmm2
+# CHECK-NEXT: - - - - - 2.50 0.50 - - - - 0.50 0.50 2.00 pmulld %xmm0, %xmm2
+# CHECK-NEXT: - - - - - 2.50 0.50 1.00 - - - 0.50 0.50 2.00 pmulld (%rax), %xmm2
# CHECK-NEXT: 1.00 - - 1.00 - 1.00 - - - - - - - - ptest %xmm0, %xmm1
# CHECK-NEXT: 1.00 - - 1.00 - 1.00 - 1.00 - - - - - - ptest (%rax), %xmm1
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - roundpd $1, %xmm0, %xmm2
More information about the llvm-commits
mailing list