[llvm] [X86] Fold EXTRACT_SUBVECTOR(ONEUSE(EXTRACT_SUBVECTOR(V,C1)),C2) -> EXTRACT_SUBVECTOR(V,C1+C2) (PR #111685)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 9 07:18:04 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Extract from the original source vector whenever possible.
This removes a number of dependency bottlenecks and helps several shuffle combining cases: on slow targets it keeps the instruction count below the threshold, allowing us to avoid a cross-lane variable shuffle, and on fast targets it makes it easier to recognise that the subvectors all came from the same source.
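To see the effect concretely, take `test_8xdouble_to_2xdouble_perm_mask0` from `partial_permute.ll` (diffed below): the shuffle previously lowered to a ymm extract followed by a second extract from that ymm, whereas the per-lane lowering now extracts the top 128-bit lane of the source zmm in one instruction. The IR is copied from the test; the asm comments restate the before/after CHECK lines from the diff:

```llvm
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
  ; before: vextractf64x4 $1, %zmm0, %ymm1
  ;         vextractf128  $1, %ymm1, %xmm1
  ; after:  vextractf32x4 $3, %zmm0, %xmm1   (CHECK-FAST-PERLANE)
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  ret <2 x double> %res
}
```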
---
Patch is 23.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111685.diff
7 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+8)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll (+32-27)
- (modified) llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (+51-26)
- (modified) llvm/test/CodeGen/X86/kshift.ll (+2-4)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll (+15-7)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll (+38-31)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ddbe82b1de5cfc..654d99c013168b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57663,6 +57663,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
+ // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) -> EXTRACT_SUBVECTOR(V,C1+C2)
+ if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
+ TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
+ unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
+ return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
+ }
+
// If we are extracting from an insert into a larger vector, replace with a
// smaller insert if we don't access less than the original subvector. Don't
// do this for i1 vectors.
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index cc22da4aa61d76..4972d3e4ec72bb 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: movw $1, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 60d5f74c7a364d..1b9e9b200a9e3c 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4201,27 +4201,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
ret <4 x double> %res
}
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
-; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
ret <2 x double> %res
}
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-FAST-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
+; CHECK-FAST-PERLANE-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
@@ -4229,15 +4245,24 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
}
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index f4efacc1946cff..a3b5d8aee03c10 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -271,8 +271,7 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm0
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
@@ -562,8 +561,7 @@ define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_63:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index bad0b411f68a95..1b80fcdedb43f2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -433,8 +433,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
; SLOW-LABEL: test_v16i32_0_1_2_12:
; SLOW: # %bb.0:
-; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SLOW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 2387e05729661e..97b262cc7ac5c3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2189,13 +2189,21 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
}
define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
-; ALL-LABEL: test_v8i64_2_5:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: ret{{[l|q]}}
+; AVX512F-LABEL: test_v8i64_2_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_v8i64_2_5:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
ret <2 x i64> %res
}
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 8cf277aa9796e8..ce092f9d343fc6 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -2478,8 +2478,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2631,8 +2630,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
@@ -2775,8 +2773,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2927,8 +2924,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
@@ -3071,8 +3067,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3222,8 +3217,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEX...
[truncated]
``````````
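The index arithmetic in the new combine is in elements: the new index is simply the outer extract index plus the inner extract's constant start index (`NewIdx = IdxVal + C1`). Below is a minimal sketch of the folded pattern, assuming nested `llvm.vector.extract` calls reach ISel as the one-use EXTRACT_SUBVECTOR chain this combine matches (`nested_extract` is a hypothetical name, not from the patch):

```llvm
; The inner extract takes elements [4,8) of %v; the outer takes elements
; [2,4) of that, i.e. elements [6,8) of %v: NewIdx = 2 + 4 = 6, which is
; 128-bit lane 3, so the fold should allow a single vextractf32x4 $3
; instead of a ymm extract followed by an xmm extract.
define <2 x double> @nested_extract(<8 x double> %v) {
  %hi  = call <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double> %v, i64 4)
  %res = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %hi, i64 2)
  ret <2 x double> %res
}

declare <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double>, i64 immarg)
declare <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double>, i64 immarg)
```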
https://github.com/llvm/llvm-project/pull/111685