[llvm] 1e31a45 - [X86] lowerShuffleWithPERMV - commute VPERMV3 shuffles so any load is on the RHS

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 23 02:48:07 PST 2024


Author: Simon Pilgrim
Date: 2024-11-23T10:47:30Z
New Revision: 1e31a4529244ead9f12abed524f33a48515abee2

URL: https://github.com/llvm/llvm-project/commit/1e31a4529244ead9f12abed524f33a48515abee2
DIFF: https://github.com/llvm/llvm-project/commit/1e31a4529244ead9f12abed524f33a48515abee2.diff

LOG: [X86] lowerShuffleWithPERMV - commute VPERMV3 shuffles so any load is on the RHS

This helps ensure we lower to VPERMI2/T2 instructions where the index argument can be commuted to the matching VPERMT2/I2 form, keeping any load foldable as the memory operand.
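
For example, in shuffle_mem_v4f32_0624 (see the vector-shuffle-128-v4.ll diff below), commuting the mask from [0,6,2,4] to [4,2,6,0] puts the load on the RHS, where it folds straight into a single vpermt2ps and drops the extra moves:

  Before:
    vmovaps (%rdi), %xmm2
    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
    vpermi2ps %xmm0, %xmm2, %xmm1
    vmovaps %xmm1, %xmm0
  After:
    vmovaps {{.*#+}} xmm1 = [4,2,6,0]
    vpermt2ps (%rdi), %xmm1, %xmm0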

Prep work for #79799

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/test/CodeGen/X86/insert-into-constant-vector.ll
    llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
    llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
    llvm/test/CodeGen/X86/vector-shuffle-v48.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index db393402757787..ff2b9199384db4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14300,9 +14300,17 @@ static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
 // the active subvector is extracted.
 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
-                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
-                                     const X86Subtarget &Subtarget,
+                                     ArrayRef<int> OriginalMask, SDValue V1,
+                                     SDValue V2, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
+  // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
+  SmallVector<int, 32> Mask(OriginalMask);
+  if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
+      !isShuffleFoldableLoad(V2)) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  }
+
   MVT MaskVT = VT.changeTypeToInteger();
   SDValue MaskNode;
   MVT ShuffleVT = VT;

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index ffbeeb19a4aebd..d07923faa4309f 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2540,9 +2540,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,4,6,1]
-; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -2556,10 +2556,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1]
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
@@ -4398,9 +4398,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
-; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
@@ -4423,11 +4423,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [7,0,6,2]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index f371ec10fe25fe..6e41e1bb87eb2f 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -447,9 +447,8 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X64-AVX512F-LABEL: elt5_v8i64:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vmovq %rdi, %xmm1
-; X64-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7]
-; X64-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [42,1,2,3,4,0,6,7]
-; X64-AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; X64-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,9,10,11,12,0,14,15]
+; X64-AVX512F-NEXT:    vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
 ; X64-AVX512F-NEXT:    retq
    %ins = insertelement <8 x i64> <i64 42, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, i64 %x, i32 5
    ret <8 x i64> %ins

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 6360c68e62cc94..6fe16f85ec6bec 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2516,10 +2516,8 @@ define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
 ;
 ; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm2
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
-; AVX512VL-NEXT:    vpermi2ps %xmm0, %xmm2, %xmm1
-; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [4,2,6,0]
+; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
   %1 = load <4 x float>, ptr %a1
   %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 0c70e31d68cf2c..3fd73319e85774 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -481,18 +481,14 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) {
 ; X86-AVX512-LABEL: test_masked_permps_v8f32:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vmovaps (%eax), %ymm2
-; X86-AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
-; X86-AVX512-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1
-; X86-AVX512-NEXT:    vmovaps %ymm1, %ymm0
+; X86-AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
+; X86-AVX512-NEXT:    vpermt2ps (%eax), %ymm1, %ymm0
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: test_masked_permps_v8f32:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %ymm2
-; X64-AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
-; X64-AVX512-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1
-; X64-AVX512-NEXT:    vmovaps %ymm1, %ymm0
+; X64-AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
+; X64-AVX512-NEXT:    vpermt2ps (%rdi), %ymm1, %ymm0
 ; X64-AVX512-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: test_masked_permps_v8f32:
@@ -500,18 +496,18 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) {
 ; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    vmovaps (%eax), %ymm1
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
-; X86-AVX512F-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512F-NEXT:    vmovaps %ymm1, %ymm0
+; X86-AVX512F-NEXT:    vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
+; X86-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: test_masked_permps_v8f32:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm1
-; X64-AVX512F-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
-; X64-AVX512F-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512F-NEXT:    vmovaps %ymm1, %ymm0
+; X64-AVX512F-NEXT:    vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
+; X64-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X64-AVX512F-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
@@ -523,35 +519,27 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) {
 ; X86-AVX512-LABEL: test_masked_permps_v16f32:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vmovaps (%eax), %zmm2
-; X86-AVX512-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X86-AVX512-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; X86-AVX512-NEXT:    vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X86-AVX512-NEXT:    vpermt2ps (%eax), %zmm1, %zmm0
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: test_masked_permps_v16f32:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %zmm2
-; X64-AVX512-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X64-AVX512-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; X64-AVX512-NEXT:    vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X64-AVX512-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
 ; X64-AVX512-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: test_masked_permps_v16f32:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovaps (%eax), %zmm2
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X86-AVX512F-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X86-AVX512F-NEXT:    vpermt2ps (%eax), %zmm1, %zmm0
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: test_masked_permps_v16f32:
 ; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovaps (%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X64-AVX512F-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X64-AVX512F-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
 ; X64-AVX512F-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 14, i32 12, i32 10, i32 8, i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index ed9f849d35d001..0efbe018764d28 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -79,10 +79,9 @@ define <32 x i8> @foo(ptr %x0) {
 ;
 ; AVX512VBMI-LABEL: foo:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,3,4,6,7,9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,33,34,36,37,39,40,42,43,45,46]
-; AVX512VBMI-NEXT:    vpermi2b %ymm2, %ymm1, %ymm0
+; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [32,33,35,36,38,39,41,42,44,45,47,48,50,51,53,54,56,57,59,60,62,63,1,2,4,5,7,8,10,11,13,14]
+; AVX512VBMI-NEXT:    vpermi2b (%rdi), %ymm1, %ymm0
 ; AVX512VBMI-NEXT:    retq
   %1 = load <48 x i8>, ptr %x0, align 1
   %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> <i32 0, i32 1, i32 3, i32 4, i32 6, i32 7, i32 9, i32 10, i32 12, i32 13, i32 15, i32 16, i32 18, i32 19, i32 21, i32 22, i32 24, i32 25, i32 27, i32 28, i32 30, i32 31, i32 33, i32 34, i32 36, i32 37, i32 39, i32 40, i32 42, i32 43, i32 45, i32 46>

diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index ac267544f0c0e8..181f5651784d8a 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -4895,11 +4895,10 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,0,2,0,8,0,6,0]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,0,10,0,0,0,14,0]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4997,11 +4996,10 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [32,1,32,3,32,5,32,7,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,32,25,32,27,32,29,32,31]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -5411,13 +5409,12 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT:    vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -5425,13 +5422,12 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT:    vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5439,11 +5435,10 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -5679,13 +5674,12 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -5693,13 +5687,12 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5707,11 +5700,10 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -5938,13 +5930,12 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -5952,13 +5943,12 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5966,11 +5956,10 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
