[llvm] r362599 - [X86][AVX] combineX86ShuffleChain - combine shuffle(extractsubvector(x), extractsubvector(y))
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 5 05:56:53 PDT 2019
Author: rksimon
Date: Wed Jun 5 05:56:53 2019
New Revision: 362599
URL: http://llvm.org/viewvc/llvm-project?rev=362599&view=rev
Log:
[X86][AVX] combineX86ShuffleChain - combine shuffle(extractsubvector(x),extractsubvector(y))
We already handle the case where we combine shuffle(extractsubvector(x), extractsubvector(x)); this relaxes the requirement to permit different sources as long as they have the same value type.
This causes a couple of cases where the VPERMV3 binary shuffles occur at a wider width than before, which I intend to improve in future commits. As only the subvector's mask indices are defined, the mask constants will broadcast, so we don't see any increase in constant size.
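The core of the change is remapping the narrow shuffle mask onto the wider extracted-from sources. As a minimal standalone sketch of that index arithmetic (plain C++ with a hypothetical name widenShuffleMask, independent of the SelectionDAG types and not the actual LLVM helper; offsets are assumed to already be rescaled to mask-element units):

#include <cstdio>
#include <vector>

// NumMaskElts: number of elements in the narrow (root) shuffle type.
// Scale:       wide-source elements / NumMaskElts.
// Offset1/2:   extraction indices rescaled to mask-element units.
static std::vector<int> widenShuffleMask(std::vector<int> Mask,
                                         unsigned NumMaskElts, unsigned Scale,
                                         unsigned Offset1, unsigned Offset2,
                                         bool SameSource) {
  // With two distinct sources, the second source is logically concatenated
  // after the first, so its element indices begin at Scale * NumMaskElts.
  if (!SameSource)
    Offset2 += Scale * NumMaskElts;

  for (int &M : Mask) {
    if (M < 0)
      continue;                        // leave undef/zero sentinels alone
    if ((unsigned)M < NumMaskElts)
      M += Offset1;                    // lane came from the first extract
    else
      M = (M - NumMaskElts) + Offset2; // lane came from the second extract
  }
  // Pad to the width of the wide source; only the low lanes are defined, the
  // rest are undef (-1), which is why the resulting constants can broadcast.
  Mask.resize(Scale * NumMaskElts, -1);
  return Mask;
}

int main() {
  // Hypothetical case: a v4 mask over two v4 extracts taken from distinct
  // 16-element sources at element offsets 8 and 4 (Offset1 = 8, Offset2 = 4
  // in mask units, Scale = 4).
  std::vector<int> Wide =
      widenShuffleMask({0, 5, 2, 7}, 4, 4, 8, 4, /*SameSource=*/false);
  for (int M : Wide)
    std::printf("%d ", M); // prints: 8 21 10 23 followed by twelve -1s
  std::printf("\n");
  return 0;
}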
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/trunk/test/CodeGen/X86/pr29112.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=362599&r1=362598&r2=362599&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jun 5 05:56:53 2019
@@ -32091,19 +32091,28 @@ static SDValue combineX86ShuffleChain(Ar
isa<ConstantSDNode>(V2.getOperand(1))) {
SDValue Src1 = V1.getOperand(0);
SDValue Src2 = V2.getOperand(0);
- if (Src1 == Src2) {
+ if (Src1.getValueType() == Src2.getValueType()) {
unsigned Offset1 = V1.getConstantOperandVal(1);
unsigned Offset2 = V2.getConstantOperandVal(1);
assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
(Offset2 % VT2.getVectorNumElements()) == 0 ||
(Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
"Unexpected subvector extraction");
+ unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+
// Convert extraction indices to mask size.
Offset1 /= VT1.getVectorNumElements();
Offset2 /= VT2.getVectorNumElements();
Offset1 *= NumMaskElts;
Offset2 *= NumMaskElts;
+ SmallVector<SDValue, 2> NewInputs;
+ NewInputs.push_back(Src1);
+ if (Src1 != Src2) {
+ NewInputs.push_back(Src2);
+ Offset2 += Scale * NumMaskElts;
+ }
+
// Create new mask for larger type.
SmallVector<int, 64> NewMask(Mask);
for (int &M : NewMask) {
@@ -32114,10 +32123,8 @@ static SDValue combineX86ShuffleChain(Ar
else
M = (M - NumMaskElts) + Offset2;
}
- unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
- SDValue NewInputs[] = {Src1};
if (SDValue Res = combineX86ShuffleChain(
NewInputs, Src1, NewMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
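Working the hunk above by hand with the same hypothetical numbers as the earlier sketch: a 4-element root mask <0,5,2,7> whose operands are v4 extracts at element offsets 8 and 4 from 16-element sources gives NumMaskElts = 4, Scale = 4, and rescaled offsets Offset1 = 8, Offset2 = 4. If Src1 == Src2 (the only case handled before this patch), Offset2 stays at 4 and the widened mask is <8,5,10,7> plus twelve undef lanes, indexing a single wide source. With distinct sources, Offset2 is bumped by Scale * NumMaskElts = 16 to 20 and the widened mask becomes <8,21,10,23> plus the undef padding, now indexing the concatenation of the two wide sources; that is what lets the recursive combineX86ShuffleChain call produce the wider binary shuffles (vpermt2d/vpermt2ps) seen in the test changes below.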
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=362599&r1=362598&r2=362599&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Wed Jun 5 05:56:53 2019
@@ -1805,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_t
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpermd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; CHECK-NEXT: vpermt2d %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
@@ -3128,10 +3128,9 @@ define <4 x float> @test_masked_z_16xflo
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
@@ -3146,12 +3145,12 @@ define <4 x float> @test_masked_16xfloat
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
Modified: llvm/trunk/test/CodeGen/X86/pr29112.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr29112.ll?rev=362599&r1=362598&r2=362599&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr29112.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr29112.ll Wed Jun 5 05:56:53 2019
@@ -11,45 +11,45 @@ define <4 x float> @bar(<4 x float>* %a1
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: vmovaps %xmm1, %xmm8
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm5
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm4, %xmm6, %xmm12
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm12
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
+; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm9, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll?rev=362599&r1=362598&r2=362599&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll Wed Jun 5 05:56:53 2019
@@ -2994,8 +2994,9 @@ define <32 x i8> @shuffle_v32i8_15_15_15
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32,15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32]
+; AVX512VLVBMI-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle