[llvm] r340757 - [X86] When lowering v32i8 MULHS/MULHU, shuffle after the PACKUS rather than before.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 27 10:20:41 PDT 2018


Author: ctopper
Date: Mon Aug 27 10:20:41 2018
New Revision: 340757

URL: http://llvm.org/viewvc/llvm-project?rev=340757&view=rev
Log:
[X86] When lowering v32i8 MULHS/MULHU, shuffle after the PACKUS rather than before.

We're using a 256-bit PACKUS to do the truncation, but that instruction operates on 128-bit lanes, so previously we shuffled first to rearrange the inputs into the correct lo/hi xmm lanes, which takes 2 shuffles. Instead we can shuffle after the PACKUS using a single VPERMQ. This matches what our normal LowerTRUNCATE code does when it uses PACKUS.

Differential Revision: https://reviews.llvm.org/D51284
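
The following standalone C++ sketch (not LLVM code; names, the byte-level model of PACKUSWB/PERMQ, and the test values are illustrative assumptions) models why a single 64-bit quadrant permute with mask {0,2,1,3} after the lane-wise 256-bit PACKUS recovers the element order that the two pre-shuffles previously had to set up:

    // Minimal sketch: simulate ymm PACKUSWB (per-128-bit-lane) followed by
    // VPERMQ {0,2,1,3}, and check the bytes come out in order 0..31.
    #include <array>
    #include <cassert>
    #include <cstdint>

    using V16i16 = std::array<uint16_t, 16>; // one ymm of 16-bit words
    using V32i8  = std::array<uint8_t, 32>;  // one ymm of bytes

    // ymm VPACKUSWB: each 128-bit lane packs 8 words of A, then 8 words of B.
    static V32i8 packuswb256(const V16i16 &A, const V16i16 &B) {
      auto Sat = [](uint16_t W) { return uint8_t(W > 255 ? 255 : W); };
      V32i8 R{};
      for (int Lane = 0; Lane < 2; ++Lane)
        for (int I = 0; I < 8; ++I) {
          R[Lane * 16 + I]     = Sat(A[Lane * 8 + I]);
          R[Lane * 16 + 8 + I] = Sat(B[Lane * 8 + I]);
        }
      return R;
    }

    // VPERMQ: permute the four 64-bit quadrants of a ymm register.
    static V32i8 permq(const V32i8 &A, std::array<int, 4> Mask) {
      V32i8 R{};
      for (int Q = 0; Q < 4; ++Q)
        for (int I = 0; I < 8; ++I)
          R[Q * 8 + I] = A[Mask[Q] * 8 + I];
      return R;
    }

    int main() {
      // Lo holds the (already right-shifted) high-half products for elements
      // 0..15, Hi for elements 16..31; use the element index as the value.
      V16i16 Lo, Hi;
      for (int I = 0; I < 16; ++I) {
        Lo[I] = uint16_t(I);
        Hi[I] = uint16_t(16 + I);
      }

      // After the lane-wise PACKUS the byte order is 0..7, 16..23, 8..15,
      // 24..31, i.e. quadrants {q0, q2, q1, q3} of the desired result.
      V32i8 Packed = packuswb256(Lo, Hi);

      // One quadrant permute with mask {0,2,1,3} restores element order.
      V32i8 Fixed = permq(Packed, {0, 2, 1, 3});
      for (int I = 0; I < 32; ++I)
        assert(Fixed[I] == I);
      return 0;
    }

This mirrors the new lowering below: PACKUS first, then a v4i64 shuffle with mask {0, 2, 1, 3}, which the backend emits as the single `vpermq ymm1 = ymm1[0,2,1,3]` seen in the updated test checks.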

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/prefer-avx256-wide-mul.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Aug 27 10:20:41 2018
@@ -23020,16 +23020,13 @@ static SDValue LowerMULH(SDValue Op, con
       Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
       Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
       Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
-      // before using PACKUS we need to permute the inputs to the correct lo/hi
-      // xmm lane.
-      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
-                            16, 17, 18, 19, 20, 21, 22, 23};
-      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
-                            24, 25, 26, 27, 28, 29, 30, 31};
-      return DAG.getNode(X86ISD::PACKUS, dl, VT,
-                         DAG.getVectorShuffle(ExVT, dl, Lo, Hi, LoMask),
-                         DAG.getVectorShuffle(ExVT, dl, Lo, Hi, HiMask));
+
+      SDValue Res = DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+      // The ymm variant of PACKUS treats the 128-bit lanes separately, so we
+      // need to permute the final result into place.
+      Res = DAG.getBitcast(MVT::v4i64, Res);
+      Res = DAG.getVectorShuffle(MVT::v4i64, dl, Res, Res, { 0, 2, 1, 3 });
+      return DAG.getBitcast(VT, Res);
     }
 
     assert(VT == MVT::v16i8 && "Unexpected VT");

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-wide-mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-wide-mul.ll?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-wide-mul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-wide-mul.ll Mon Aug 27 10:20:41 2018
@@ -15,9 +15,8 @@ define <32 x i8> @test_div7_32i8(<32 x i
 ; AVX256BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX256BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX256BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX256BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
-; AVX256BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX256BW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX256BW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX256BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
 ; AVX256BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll Mon Aug 27 10:20:41 2018
@@ -208,9 +208,8 @@ define <32 x i8> @test_div7_32i8(<32 x i
 ; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm3
 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
-; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
 ; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm0, %ymm1
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
@@ -512,9 +511,8 @@ define <32 x i8> @test_rem7_32i8(<32 x i
 ; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm3
 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
-; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
 ; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm1, %ymm2
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll Mon Aug 27 10:20:41 2018
@@ -135,9 +135,8 @@ define <64 x i8> @test_div7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -156,9 +155,8 @@ define <64 x i8> @test_div7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm7
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
 ; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
@@ -370,9 +368,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm5
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -403,9 +400,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm8
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm8, %ymm2
 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm2[2,3],ymm7[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm7
 ; AVX512F-NEXT:    vpand %ymm4, %ymm7, %ymm4

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll Mon Aug 27 10:20:41 2018
@@ -214,9 +214,8 @@ define <32 x i8> @test_div7_32i8(<32 x i
 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
-; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -518,9 +517,8 @@ define <32 x i8> @test_rem7_32i8(<32 x i
 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
-; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll?rev=340757&r1=340756&r2=340757&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll Mon Aug 27 10:20:41 2018
@@ -146,9 +146,8 @@ define <64 x i8> @test_div7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; AVX512F-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
@@ -164,9 +163,8 @@ define <64 x i8> @test_div7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpackuswb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
 ; AVX512F-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
@@ -384,17 +382,16 @@ define <64 x i8> @test_rem7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
 ; AVX512F-NEXT:    vpsubb %ymm3, %ymm0, %ymm4
-; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpand %ymm4, %ymm5, %ymm5
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmovsxbw %xmm6, %ymm7
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm7
@@ -414,15 +411,14 @@ define <64 x i8> @test_rem7_64i8(<64 x i
 ; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm7, %ymm2
 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm2[2,3],ymm6[2,3]
-; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm6
 ; AVX512F-NEXT:    vpsrlw $1, %ymm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm5, %ymm2
 ; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
