[llvm] r353332 - [x86] add tests for horizontal ops (PR38971, PR33758); NFC
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 6 11:40:11 PST 2019
Author: spatel
Date: Wed Feb 6 11:40:11 2019
New Revision: 353332
URL: http://llvm.org/viewvc/llvm-project?rev=353332&view=rev
Log:
[x86] add tests for horizontal ops (PR38971, PR33758); NFC
Modified:
llvm/trunk/test/CodeGen/X86/haddsub.ll
llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
Modified: llvm/trunk/test/CodeGen/X86/haddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub.ll?rev=353332&r1=353331&r2=353332&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub.ll Wed Feb 6 11:40:11 2019
@@ -1352,3 +1352,91 @@ define float @extract_extract_v4f32_fadd
ret float %x01
}
+; Repeat tests from general reductions to verify output for hoppy targets:
+; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
+
+define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
+; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
+; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm2, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: fadd_reduce_v8f32:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: addps %xmm2, %xmm1
+; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: fadd_reduce_v8f32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: fadd_reduce_v8f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-FAST-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ ret float %r
+}
+
+define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
+; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
+; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: fadd_reduce_v4f64:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
+; SSE3-FAST-NEXT: addpd %xmm2, %xmm0
+; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: fadd_reduce_v4f64:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: fadd_reduce_v4f64:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-FAST-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
+ %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ ret double %r
+}
+
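The new haddsub.ll tests exercise the fast-math fadd reduction intrinsic. For reference, a minimal C sketch of the kind of source loop that typically becomes this reduction (assuming something like clang -O2 -ffast-math -mavx; the function name is illustrative and not part of the commit):

/* Scalar start value plus an 8-element sum, matching the shape of
   llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1).
   With -ffast-math the loop vectorizer is expected to emit the fast
   reduction intrinsic, which the x86 backend then lowers to the
   shuffle+add or haddps/vhaddps sequences in the CHECK lines above. */
float fadd_reduce_v8f32_src(float a0, const float *a1) {
  float acc = a0;
  for (int i = 0; i < 8; ++i)
    acc += a1[i];
  return acc;
}
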
Modified: llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll?rev=353332&r1=353331&r2=353332&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll Wed Feb 6 11:40:11 2019
@@ -902,3 +902,257 @@ define i32 @extract_extract_v4i32_add_i3
ret i32 %x01
}
+; PR33758: https://bugs.llvm.org/show_bug.cgi?id=33758
+
+define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_add_v8i32:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movd %xmm0, %eax
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: partial_reduction_add_v8i32:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
+; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
+; SSE3-FAST-NEXT: movd %xmm1, %eax
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: partial_reduction_add_v8i32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
+; AVX512-FAST-NEXT: vzeroupper
+; AVX512-FAST-NEXT: retq
+ %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0213 = add <8 x i32> %x, %x23
+ %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0123 = add <8 x i32> %x0213, %x13
+ %r = extractelement <8 x i32> %x0123, i32 0
+ ret i32 %r
+}
+
+define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_add_v16i32:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movd %xmm0, %eax
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: partial_reduction_add_v16i32:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
+; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
+; SSE3-FAST-NEXT: movd %xmm1, %eax
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: partial_reduction_add_v16i32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
+; AVX512-FAST-NEXT: vzeroupper
+; AVX512-FAST-NEXT: retq
+ %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0213 = add <16 x i32> %x, %x23
+ %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0123 = add <16 x i32> %x0213, %x13
+ %r = extractelement <16 x i32> %x0123, i32 0
+ ret i32 %r
+}
+
+define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movd %xmm0, %eax
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: partial_reduction_sub_v8i32:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: psubd %xmm1, %xmm0
+; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
+; SSE3-FAST-NEXT: movd %xmm0, %eax
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: partial_reduction_sub_v8i32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
+; AVX512-FAST-NEXT: vzeroupper
+; AVX512-FAST-NEXT: retq
+ %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0213 = sub <8 x i32> %x, %x23
+ %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0123 = sub <8 x i32> %x0213, %x13
+ %r = extractelement <8 x i32> %x0123, i32 0
+ ret i32 %r
+}
+
+define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movd %xmm0, %eax
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: partial_reduction_sub_v16i32:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: psubd %xmm1, %xmm0
+; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
+; SSE3-FAST-NEXT: movd %xmm0, %eax
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: partial_reduction_sub_v16i32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
+; AVX512-FAST-NEXT: vzeroupper
+; AVX512-FAST-NEXT: retq
+ %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0213 = sub <16 x i32> %x, %x23
+ %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x0123 = sub <16 x i32> %x0213, %x13
+ %r = extractelement <16 x i32> %x0123, i32 0
+ ret i32 %r
+}
+
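The phaddsub-extract.ll additions cover the PR33758 partial-reduction pattern: only the low lanes of a wider vector are summed via shufflevector+add+extractelement. A minimal C sketch of that pattern using clang's vector extensions (names are illustrative; this approximates the IR in the tests rather than reproducing code from the bug report):

/* Sum of lanes 0..3 of an 8 x i32 vector, written to mirror the
   shuffle/add/extract chain in partial_reduction_add_v8i32.
   The concern in PR33758 is that the backend may perform this with
   256-bit (ymm) instructions even though 128 bits are enough. */
typedef int v8si __attribute__((vector_size(32)));

int partial_reduction_add_v8i32_src(v8si x) {
  v8si x23 = __builtin_shufflevector(x, x, 2, 3, -1, -1, -1, -1, -1, -1);
  v8si x0213 = x + x23;
  v8si x13 = __builtin_shufflevector(x0213, x0213, 1, -1, -1, -1, -1, -1, -1, -1);
  v8si x0123 = x0213 + x13;
  return x0123[0];
}
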