[llvm] r357367 - [x86] allow movmsk with 2-element reductions
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 31 08:11:34 PDT 2019
Author: spatel
Date: Sun Mar 31 08:11:34 2019
New Revision: 357367
URL: http://llvm.org/viewvc/llvm-project?rev=357367&view=rev
Log:
[x86] allow movmsk with 2-element reductions
One motivation for this change is that not using movmsk is likely a major source
of the performance difference between clang and gcc on the C-Ray benchmark shown here:
https://www.phoronix.com/scan.php?page=article&item=gcc-clang-2019&num=5
...but this change alone isn't enough to close that gap.
The 'all-of' examples show what is likely the worst-case trade-off: we end up with
an extra instruction (or two, if we count the 'xor' register clearing). The 'any-of'
examples are clearly better with movmsk because we trade 2 vector instructions for
2 scalar instructions, and movmsk may have better timing than the generic 'movq'.
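As a point of reference, here is a minimal scalar sketch (not part of the patch; the
function name and parameter are illustrative) of what the new SSE/AVX 'any-of'
sequence in the tests below computes. movmskpd packs the two sign bits of the
compare result into a 2-bit mask, 'negl' sets the carry flag iff that mask is
nonzero, and 'sbbq %rax, %rax' materializes -CF:

  #include <cstdint>

  // Hypothetical name; models movmskpd + negl + sbbq from the 'any-of' tests.
  // sign_mask holds the two packed sign bits produced by movmskpd.
  int64_t any_of_from_movmsk(uint32_t sign_mask) {
    // A nonzero mask means at least one lane's compare was true; the result is
    // the same all-ones/zero value the sign-extended reduction produces.
    return (sign_mask != 0) ? -1 : 0;
  }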
If we examine the llvm-mca output for these cases, it appears that even though the
'all-of' movmsk variant looks worse on paper, it would perform better on both
Haswell and Jaguar.
$ llvm-mca -mcpu=haswell no_movmsk.s -timeline
Iterations: 100
Instructions: 400
Total Cycles: 504
Total uOps: 400
Dispatch Width: 4
uOps Per Cycle: 0.79
IPC: 0.79
Block RThroughput: 1.0
$ llvm-mca -mcpu=haswell movmsk.s -timeline
Iterations: 100
Instructions: 600
Total Cycles: 358
Total uOps: 600
Dispatch Width: 4
uOps Per Cycle: 1.68
IPC: 1.68
Block RThroughput: 1.5
$ llvm-mca -mcpu=btver2 no_movmsk.s -timeline
Iterations: 100
Instructions: 400
Total Cycles: 407
Total uOps: 400
Dispatch Width: 2
uOps Per Cycle: 0.98
IPC: 0.98
Block RThroughput: 2.0
$ llvm-mca -mcpu=btver2 movmsk.s -timeline
Iterations: 100
Instructions: 600
Total Cycles: 311
Total uOps: 600
Dispatch Width: 2
uOps Per Cycle: 1.93
IPC: 1.93
Block RThroughput: 3.0
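Per these llvm-mca estimates, the movmsk version covers 100 iterations in 358 vs. 504
cycles on Haswell (about 1.41x faster) and 311 vs. 407 cycles on Jaguar (about 1.31x),
even though each iteration executes two more instructions (6 vs. 4).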
Finally, there may be CPUs where movmsk is horribly slow (old AMD small cores?), but if
that's true, then we're also almost certainly making the wrong transform already for
reductions with >2 elements, so that should be fixed independently.
Differential Revision: https://reviews.llvm.org/D59997
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=357367&r1=357366&r2=357367&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Mar 31 08:11:34 2019
@@ -34306,8 +34306,11 @@ static SDValue combineHorizontalPredicat
((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
return SDValue();
- // Don't bother performing this for 2-element vectors.
- if (Match.getValueType().getVectorNumElements() <= 2)
+ // Make sure this isn't a vector of 1 element. The perf win from using MOVMSK
+ // diminishes with fewer elements in the reduction, but it is generally better
+ // to get the comparison over to the GPRs as soon as possible to reduce the
+ // number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
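Similarly, here is a minimal scalar sketch (not part of the patch; the function name
and parameter are illustrative) of what the new 'all-of' sequence in the tests below
computes: movmskpd packs both sign bits into a 2-bit mask, 'cmpl $3' + 'sete' check
that both bits are set, and 'negq' sign-extends the 0/1 result back to the 0/-1
value of the sext'd reduction:

  #include <cstdint>

  // Hypothetical name; models movmskpd + xorl + cmpl $3 + sete + negq.
  // sign_mask holds the two packed sign bits produced by movmskpd.
  int64_t all_of_from_movmsk(uint32_t sign_mask) {
    // Both lanes' compares must be true, i.e. both sign bits must be set.
    return (sign_mask == 3) ? -1 : 0;
  }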
Modified: llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll?rev=357367&r1=357366&r2=357367&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll Sun Mar 31 08:11:34 2019
@@ -8,17 +8,21 @@ define i64 @test_v2f64_sext(<2 x double>
; SSE-LABEL: test_v2f64_sext:
; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movmskpd %xmm1, %ecx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: cmpl $3, %ecx
+; SSE-NEXT: sete %al
+; SSE-NEXT: negq %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovmskpd %xmm0, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: cmpl $3, %ecx
+; AVX-NEXT: sete %al
+; AVX-NEXT: negq %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +46,11 @@ define i64 @test_v4f64_sext(<4 x double>
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
; SSE-NEXT: andpd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movmskpd %xmm2, %ecx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: cmpl $3, %ecx
+; SSE-NEXT: sete %al
+; SSE-NEXT: negq %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
@@ -273,17 +279,21 @@ define i64 @test_v2i64_sext(<2 x i64> %a
; SSE-LABEL: test_v2i64_sext:
; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movmskpd %xmm0, %ecx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: cmpl $3, %ecx
+; SSE-NEXT: sete %al
+; SSE-NEXT: negq %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovmskpd %xmm0, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: cmpl $3, %ecx
+; AVX-NEXT: sete %al
+; AVX-NEXT: negq %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64_sext:
@@ -307,9 +317,11 @@ define i64 @test_v4i64_sext(<4 x i64> %a
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movmskpd %xmm0, %ecx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: cmpl $3, %ecx
+; SSE-NEXT: sete %al
+; SSE-NEXT: negq %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext:
Modified: llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll?rev=357367&r1=357366&r2=357367&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll Sun Mar 31 08:11:34 2019
@@ -8,17 +8,17 @@ define i64 @test_v2f64_sext(<2 x double>
; SSE-LABEL: test_v2f64_sext:
; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movmskpd %xmm1, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbq %rax, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovmskpd %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbq %rax, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +42,9 @@ define i64 @test_v4f64_sext(<4 x double>
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movmskpd %xmm2, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbq %rax, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
@@ -255,17 +255,17 @@ define i64 @test_v2i64_sext(<2 x i64> %a
; SSE-LABEL: test_v2i64_sext:
; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movmskpd %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbq %rax, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovmskpd %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: sbbq %rax, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64_sext:
@@ -289,9 +289,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movmskpd %xmm0, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: sbbq %rax, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext: