[llvm] r335256 - [X86] Update fast-isel tests for clang r335253.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 21 09:54:18 PDT 2018
Author: ctopper
Date: Thu Jun 21 09:54:18 2018
New Revision: 335256
URL: http://llvm.org/viewvc/llvm-project?rev=335256&view=rev
Log:
[X86] Update fast-isel tests for clang r335253.
The new IR fixes a mismatch in the final extractelement for the i32 intrinsics. Previously we extracted a 64-bit element even though we only wanted 32 bits.
SimplifyDemandedElts isn't able to make FP elements undef now and the shuffle mask I used prevents the use of horizontal add we had before. Not sure we should have been using horizontal add anyway. It's implemented on Intel with two port 5 shuffles and an add. So we have one less shuffle now, but an additional instruction to decode.
Differential Revision: https://reviews.llvm.org/D48347
Modified:
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll?rev=335256&r1=335255&r2=335256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll Thu Jun 21 09:54:18 2018
@@ -6734,189 +6734,130 @@ entry:
}
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_add_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_add_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__W to <16 x i32>
- %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %add7.i = add <4 x i32> %shuffle6.i, %add4.i
- %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %add10.i = add <4 x i32> %shuffle9.i, %add7.i
- %1 = bitcast <4 x i32> %add10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %1, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+; CHECK-LABEL: test_mm512_reduce_add_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
+entry:
+ %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %0 = bitcast <4 x i64> %extract.i to <8 x i32>
+ %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
+ %add.i = add <8 x i32> %0, %1
+ %2 = bitcast <8 x i32> %add.i to <4 x i64>
+ %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
+ %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
+ %add5.i = add <4 x i32> %3, %4
+ %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %add6.i = add <4 x i32> %shuffle.i, %add5.i
+ %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %add8.i = add <4 x i32> %shuffle7.i, %add6.i
+ %vecext.i = extractelement <4 x i32> %add8.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_mul_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_mul_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__W to <16 x i32>
- %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
- %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
- %1 = bitcast <4 x i32> %mul10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %1, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+; CHECK-LABEL: test_mm512_reduce_mul_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
+entry:
+ %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %0 = bitcast <4 x i64> %extract.i to <8 x i32>
+ %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
+ %mul.i = mul <8 x i32> %0, %1
+ %2 = bitcast <8 x i32> %mul.i to <4 x i64>
+ %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
+ %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
+ %mul5.i = mul <4 x i32> %3, %4
+ %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
+ %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
+ %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_or_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_or_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__W to <16 x i32>
- %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %or7.i = or <4 x i32> %shuffle6.i, %or4.i
- %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %or10.i = or <4 x i32> %shuffle9.i, %or7.i
- %1 = bitcast <4 x i32> %or10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %1, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+; CHECK-LABEL: test_mm512_reduce_or_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
+entry:
+ %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %or25.i = or <4 x i64> %extract.i, %extract2.i
+ %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %or526.i = or <2 x i64> %extract3.i, %extract4.i
+ %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
+ %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %or6.i = or <4 x i32> %shuffle.i, %or5.i
+ %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %or8.i = or <4 x i32> %shuffle7.i, %or6.i
+ %vecext.i = extractelement <4 x i32> %or8.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_and_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_and_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__W to <16 x i32>
- %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %and7.i = and <4 x i32> %shuffle6.i, %and4.i
- %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %and10.i = and <4 x i32> %shuffle9.i, %and7.i
- %1 = bitcast <4 x i32> %and10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %1, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+; CHECK-LABEL: test_mm512_reduce_and_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
+entry:
+ %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %and25.i = and <4 x i64> %extract.i, %extract2.i
+ %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %and526.i = and <2 x i64> %extract3.i, %extract4.i
+ %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
+ %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %and6.i = and <4 x i32> %shuffle.i, %and5.i
+ %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %and8.i = and <4 x i32> %shuffle7.i, %and6.i
+ %vecext.i = extractelement <4 x i32> %and8.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -6946,28 +6887,31 @@ define i32 @test_mm512_mask_reduce_add_e
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
- %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %add7.i = add <4 x i32> %shuffle6.i, %add4.i
- %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %add10.i = add <4 x i32> %shuffle9.i, %add7.i
- %3 = bitcast <4 x i32> %add10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %3, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = bitcast <4 x i64> %extract.i to <8 x i32>
+ %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
+ %add.i = add <8 x i32> %4, %5
+ %6 = bitcast <8 x i32> %add.i to <4 x i64>
+ %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
+ %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
+ %add6.i = add <4 x i32> %7, %8
+ %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %add7.i = add <4 x i32> %shuffle.i, %add6.i
+ %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %add9.i = add <4 x i32> %shuffle8.i, %add7.i
+ %vecext.i = extractelement <4 x i32> %add9.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7001,28 +6945,31 @@ define i32 @test_mm512_mask_reduce_mul_e
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
- %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
- %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
- %3 = bitcast <4 x i32> %mul10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %3, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = bitcast <4 x i64> %extract.i to <8 x i32>
+ %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
+ %mul.i = mul <8 x i32> %4, %5
+ %6 = bitcast <8 x i32> %mul.i to <4 x i64>
+ %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
+ %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
+ %mul7.i = mul <4 x i32> %7, %8
+ %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
+ %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
+ %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7056,28 +7003,27 @@ define i32 @test_mm512_mask_reduce_and_e
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %and7.i = and <4 x i32> %shuffle6.i, %and4.i
- %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %and10.i = and <4 x i32> %shuffle9.i, %and7.i
- %3 = bitcast <4 x i32> %and10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %3, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %and28.i = and <4 x i64> %extract.i, %extract4.i
+ %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %and729.i = and <2 x i64> %extract5.i, %extract6.i
+ %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
+ %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %and8.i = and <4 x i32> %shuffle.i, %and7.i
+ %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %and10.i = and <4 x i32> %shuffle9.i, %and8.i
+ %vecext.i = extractelement <4 x i32> %and10.i, i32 0
+ ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7109,28 +7055,27 @@ define i32 @test_mm512_mask_reduce_or_ep
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
- %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %or7.i = or <4 x i32> %shuffle6.i, %or4.i
- %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %or10.i = or <4 x i32> %shuffle9.i, %or7.i
- %3 = bitcast <4 x i32> %or10.i to <2 x i64>
- %vecext.i = extractelement <2 x i64> %3, i32 0
- %conv.i = trunc i64 %vecext.i to i32
- ret i32 %conv.i
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %or27.i = or <4 x i64> %extract.i, %extract3.i
+ %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %or628.i = or <2 x i64> %extract4.i, %extract5.i
+ %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
+ %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %or7.i = or <4 x i32> %shuffle.i, %or6.i
+ %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %or9.i = or <4 x i32> %shuffle8.i, %or7.i
+ %vecext.i = extractelement <4 x i32> %or9.i, i32 0
+ ret i32 %vecext.i
}
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
@@ -7147,7 +7092,8 @@ define double @test_mm512_reduce_add_pd(
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -7162,7 +7108,8 @@ define double @test_mm512_reduce_add_pd(
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@@ -7172,7 +7119,7 @@ entry:
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
@@ -7219,7 +7166,7 @@ entry:
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
@@ -7236,7 +7183,8 @@ define float @test_mm512_reduce_add_ps(<
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -7252,21 +7200,25 @@ define float @test_mm512_reduce_add_ps(<
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
- %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
- %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
- %vecext.i = extractelement <4 x float> %add10.i, i32 0
+ %0 = bitcast <16 x float> %__W to <8 x double>
+ %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast <4 x double> %extract.i to <8 x float>
+ %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <4 x double> %extract2.i to <8 x float>
+ %add.i = fadd <8 x float> %1, %2
+ %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %add5.i = fadd <4 x float> %extract3.i, %extract4.i
+ %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %add6.i = fadd <4 x float> %add5.i, %shuffle.i
+ %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
+ %vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}
@@ -7281,7 +7233,7 @@ define float @test_mm512_reduce_mul_ps(<
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
@@ -7298,22 +7250,25 @@ define float @test_mm512_reduce_mul_ps(<
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
- %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
- %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
- %vecext.i = extractelement <4 x float> %mul10.i, i32 0
+ %0 = bitcast <16 x float> %__W to <8 x double>
+ %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast <4 x double> %extract.i to <8 x float>
+ %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <4 x double> %extract2.i to <8 x float>
+ %mul.i = fmul <8 x float> %1, %2
+ %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
+ %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
+ %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
+ %vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}
@@ -7334,7 +7289,8 @@ define double @test_mm512_mask_reduce_ad
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -7351,7 +7307,8 @@ define double @test_mm512_mask_reduce_ad
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@@ -7363,7 +7320,7 @@ entry:
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
@@ -7419,7 +7376,7 @@ entry:
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
@@ -7438,7 +7395,8 @@ define float @test_mm512_mask_reduce_add
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -7456,23 +7414,27 @@ define float @test_mm512_mask_reduce_add
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
- %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
- %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
- %vecext.i = extractelement <4 x float> %add10.i, i32 0
+ %2 = bitcast <16 x float> %1 to <8 x double>
+ %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x double> %extract.i to <8 x float>
+ %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <4 x double> %extract3.i to <8 x float>
+ %add.i = fadd <8 x float> %3, %4
+ %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %add6.i = fadd <4 x float> %extract4.i, %extract5.i
+ %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %add7.i = fadd <4 x float> %add6.i, %shuffle.i
+ %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
+ %vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}
@@ -7490,7 +7452,7 @@ define float @test_mm512_mask_reduce_mul
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
@@ -7510,23 +7472,26 @@ define float @test_mm512_mask_reduce_mul
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
- %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
- %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
- %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
+ %2 = bitcast <16 x float> %1 to <8 x double>
+ %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x double> %extract.i to <8 x float>
+ %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <4 x double> %extract4.i to <8 x float>
+ %mul.i = fmul <8 x float> %3, %4
+ %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
+ %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
+ %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}
More information about the llvm-commits
mailing list