[llvm] aeb4626 - [X86] Remove unnecessary OneUse limit from combineToExtendBoolVectorInReg (vXi1 bitcast(iX Cond)) select expansion

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 11 06:24:31 PDT 2023


Author: Simon Pilgrim
Date: 2023-04-11T14:18:50+01:00
New Revision: aeb4626b395960d885c45c39b8cd0aab4e7d6c31

URL: https://github.com/llvm/llvm-project/commit/aeb4626b395960d885c45c39b8cd0aab4e7d6c31
DIFF: https://github.com/llvm/llvm-project/commit/aeb4626b395960d885c45c39b8cd0aab4e7d6c31.diff

LOG: [X86] Remove unnecessary OneUse limit from combineToExtendBoolVectorInReg (vXi1 bitcast(iX Cond)) select expansion

We already allow multiple uses when combineToExtendBoolVectorInReg is called from combineSext/combineZext - the hasOneUse check in the select expansion seems to have been a copy+paste from some of the similar AVX512 vselect folds.

Fixes #62014

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/pr62014.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 88495d8f7fbe2..21e968934cd1a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46763,7 +46763,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
   // get split by legalization.
   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
-      CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
+      CondVT.getVectorElementType() == MVT::i1 &&
       TLI.isTypeLegal(VT.getScalarType())) {
     EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
     if (SDValue ExtCond = combineToExtendBoolVectorInReg(

diff  --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll
index 6fad0616577c1..e8e15d016f0a9 100644
--- a/llvm/test/CodeGen/X86/pr62014.ll
+++ b/llvm/test/CodeGen/X86/pr62014.ll
@@ -5,77 +5,47 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f          | FileCheck %s --check-prefixes=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,avx512vl | FileCheck %s --check-prefixes=AVX512VL
 
-define <2 x i64> @select_cast_cond_multiuse_v8i64(<2 x i64> %x, <2 x i64> %y, i2 %m, ptr %o) {
-; SSE2-LABEL: select_cast_cond_multiuse_v8i64:
+define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2 %m, ptr %o) {
+; SSE2-LABEL: select_cast_cond_multiuse_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movd %edi, %xmm2
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negq %rax
-; SSE2-NEXT:    movq %rax, %xmm3
-; SSE2-NEXT:    andb $2, %dil
-; SSE2-NEXT:    shrb %dil
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    negq %rax
-; SSE2-NEXT:    movq %rax, %xmm4
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,2]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, (%rsi)
 ; SSE2-NEXT:    pandn %xmm1, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, (%rsi)
 ; SSE2-NEXT:    retq
 ;
-; SSE42-LABEL: select_cast_cond_multiuse_v8i64:
+; SSE42-LABEL: select_cast_cond_multiuse_v2i64:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movapd %xmm0, %xmm2
-; SSE42-NEXT:    movd %edi, %xmm3
-; SSE42-NEXT:    movl %edi, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    negq %rax
-; SSE42-NEXT:    movq %rax, %xmm0
-; SSE42-NEXT:    andb $2, %dil
-; SSE42-NEXT:    shrb %dil
-; SSE42-NEXT:    movzbl %dil, %eax
-; SSE42-NEXT:    negq %rax
-; SSE42-NEXT:    movq %rax, %xmm4
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [1,2]
-; SSE42-NEXT:    pand %xmm4, %xmm3
-; SSE42-NEXT:    pcmpeqq %xmm4, %xmm3
+; SSE42-NEXT:    movd %edi, %xmm0
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [1,2]
+; SSE42-NEXT:    pand %xmm3, %xmm0
+; SSE42-NEXT:    pcmpeqq %xmm3, %xmm0
 ; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, (%rsi)
+; SSE42-NEXT:    movdqa %xmm0, (%rsi)
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX2-LABEL: select_cast_cond_multiuse_v8i64:
+; AVX2-LABEL: select_cast_cond_multiuse_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm2
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negq %rax
-; AVX2-NEXT:    vmovq %rax, %xmm3
-; AVX2-NEXT:    andb $2, %dil
-; AVX2-NEXT:    shrb %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    negq %rax
-; AVX2-NEXT:    vmovq %rax, %xmm4
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2]
-; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqq %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: select_cast_cond_multiuse_v8i64:
+; AVX512F-LABEL: select_cast_cond_multiuse_v2i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
@@ -87,7 +57,7 @@ define <2 x i64> @select_cast_cond_multiuse_v8i64(<2 x i64> %x, <2 x i64> %y, i2
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VL-LABEL: select_cast_cond_multiuse_v8i64:
+; AVX512VL-LABEL: select_cast_cond_multiuse_v2i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -105,104 +75,38 @@ define <2 x i64> @select_cast_cond_multiuse_v8i64(<2 x i64> %x, <2 x i64> %y, i2
 define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4 %m, ptr %o) {
 ; SSE2-LABEL: select_cast_cond_multiuse_v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    andb $8, %al
-; SSE2-NEXT:    shrb $3, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $2, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE2-NEXT:    movd %edi, %xmm2
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    shrb %dil
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8]
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, (%rsi)
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: select_cast_cond_multiuse_v4i32:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps %xmm0, %xmm2
-; SSE42-NEXT:    movl %edi, %eax
-; SSE42-NEXT:    shrb %al
-; SSE42-NEXT:    movzbl %al, %eax
-; SSE42-NEXT:    movl %edi, %ecx
-; SSE42-NEXT:    movl %edi, %edx
-; SSE42-NEXT:    movd %edi, %xmm3
-; SSE42-NEXT:    andl $1, %edi
-; SSE42-NEXT:    negl %edi
 ; SSE42-NEXT:    movd %edi, %xmm0
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE42-NEXT:    shrb $2, %cl
-; SSE42-NEXT:    movzbl %cl, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE42-NEXT:    andb $8, %dl
-; SSE42-NEXT:    shrb $3, %dl
-; SSE42-NEXT:    movzbl %dl, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrd $3, %eax, %xmm0
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8]
-; SSE42-NEXT:    pand %xmm4, %xmm3
-; SSE42-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE42-NEXT:    pand %xmm3, %xmm0
+; SSE42-NEXT:    pcmpeqd %xmm3, %xmm0
 ; SSE42-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, (%rsi)
+; SSE42-NEXT:    movdqa %xmm0, (%rsi)
 ; SSE42-NEXT:    movaps %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: select_cast_cond_multiuse_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    movl %edi, %edx
 ; AVX2-NEXT:    vmovd %edi, %xmm2
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    negl %edi
-; AVX2-NEXT:    vmovd %edi, %xmm3
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    shrb $2, %cl
-; AVX2-NEXT:    movzbl %cl, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    andb $8, %dl
-; AVX2-NEXT:    shrb $3, %dl
-; AVX2-NEXT:    movzbl %dl, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm3
 ; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8]
-; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2,4,8]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-NEXT:    retq
 ;
@@ -236,179 +140,40 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4
 define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8 %m, ptr %o) {
 ; SSE2-LABEL: select_cast_cond_multiuse_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $7, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $6, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $5, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $4, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $3, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    shrb $2, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    movd %edi, %xmm3
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    shrb %dil
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    negl %eax
-; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    movd %edi, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, (%rsi)
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: select_cast_cond_multiuse_v8i16:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    movl %edi, %eax
-; SSE42-NEXT:    shrb %al
-; SSE42-NEXT:    movzbl %al, %eax
-; SSE42-NEXT:    movl %edi, %ecx
-; SSE42-NEXT:    shrb $2, %cl
-; SSE42-NEXT:    movzbl %cl, %ecx
-; SSE42-NEXT:    movl %edi, %edx
-; SSE42-NEXT:    shrb $3, %dl
-; SSE42-NEXT:    movzbl %dl, %edx
-; SSE42-NEXT:    movl %edi, %r8d
-; SSE42-NEXT:    shrb $4, %r8b
-; SSE42-NEXT:    movzbl %r8b, %r8d
-; SSE42-NEXT:    movl %edi, %r9d
-; SSE42-NEXT:    shrb $5, %r9b
-; SSE42-NEXT:    movzbl %r9b, %r9d
-; SSE42-NEXT:    movl %edi, %r10d
-; SSE42-NEXT:    movl %edi, %r11d
-; SSE42-NEXT:    movd %edi, %xmm3
-; SSE42-NEXT:    andl $1, %edi
-; SSE42-NEXT:    negl %edi
 ; SSE42-NEXT:    movd %edi, %xmm0
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    negl %ecx
-; SSE42-NEXT:    pinsrw $2, %ecx, %xmm0
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    negl %edx
-; SSE42-NEXT:    pinsrw $3, %edx, %xmm0
-; SSE42-NEXT:    andl $1, %r8d
-; SSE42-NEXT:    negl %r8d
-; SSE42-NEXT:    pinsrw $4, %r8d, %xmm0
-; SSE42-NEXT:    andl $1, %r9d
-; SSE42-NEXT:    negl %r9d
-; SSE42-NEXT:    pinsrw $5, %r9d, %xmm0
-; SSE42-NEXT:    shrb $6, %r10b
-; SSE42-NEXT:    movzbl %r10b, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE42-NEXT:    shrb $7, %r11b
-; SSE42-NEXT:    movzbl %r11b, %eax
-; SSE42-NEXT:    negl %eax
-; SSE42-NEXT:    pinsrw $7, %eax, %xmm0
-; SSE42-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
-; SSE42-NEXT:    pand %xmm4, %xmm3
-; SSE42-NEXT:    pcmpeqw %xmm4, %xmm3
+; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; SSE42-NEXT:    pand %xmm3, %xmm0
+; SSE42-NEXT:    pcmpeqw %xmm3, %xmm0
 ; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, (%rsi)
+; SSE42-NEXT:    movdqa %xmm0, (%rsi)
 ; SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: select_cast_cond_multiuse_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    shrb $2, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    movl %edi, %edx
-; AVX2-NEXT:    shrb $3, %dl
-; AVX2-NEXT:    movzbl %dl, %edx
-; AVX2-NEXT:    movl %edi, %r8d
-; AVX2-NEXT:    shrb $4, %r8b
-; AVX2-NEXT:    movzbl %r8b, %r8d
-; AVX2-NEXT:    movl %edi, %r9d
-; AVX2-NEXT:    shrb $5, %r9b
-; AVX2-NEXT:    movzbl %r9b, %r9d
-; AVX2-NEXT:    movl %edi, %r10d
-; AVX2-NEXT:    movl %edi, %r11d
 ; AVX2-NEXT:    vmovd %edi, %xmm2
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    negl %edi
-; AVX2-NEXT:    vmovd %edi, %xmm3
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    negl %ecx
-; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    negl %edx
-; AVX2-NEXT:    vpinsrw $3, %edx, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %r8d
-; AVX2-NEXT:    negl %r8d
-; AVX2-NEXT:    vpinsrw $4, %r8d, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %r9d
-; AVX2-NEXT:    negl %r9d
-; AVX2-NEXT:    vpinsrw $5, %r9d, %xmm3, %xmm3
-; AVX2-NEXT:    shrb $6, %r10b
-; AVX2-NEXT:    movzbl %r10b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    shrb $7, %r11b
-; AVX2-NEXT:    movzbl %r11b, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
-; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-NEXT:    retq
 ;
@@ -442,249 +207,41 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8
 define <16 x i8> @select_cast_cond_multiuse_v16i8(<16 x i8> %x, <16 x i8> %y, i16 %m, ptr %o) {
 ; SSE2-LABEL: select_cast_cond_multiuse_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    btl $15, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    btl $14, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT:    btl $13, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    btl $12, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    btl $11, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    btl $10, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT:    btl $9, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm5
-; SSE2-NEXT:    btl $8, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT:    btl $5, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm4
-; SSE2-NEXT:    btl $4, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    btl $6, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm4
-; SSE2-NEXT:    movl %edi, %ecx
-; SSE2-NEXT:    sarb $7, %cl
-; SSE2-NEXT:    movzbl %cl, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    btl $3, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm4
-; SSE2-NEXT:    btl $2, %edi
-; SSE2-NEXT:    movl $0, %ecx
-; SSE2-NEXT:    sbbl %ecx, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT:    btl $1, %edi
-; SSE2-NEXT:    sbbl %eax, %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movd %edi, %xmm6
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    negb %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    movd %eax, %xmm7
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm6[0,0,1,1,4,5,6,7]
+; SSE2-NEXT:    movd %edi, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm7
-; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, (%rsi)
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: select_cast_cond_multiuse_v16i8:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    xorl %eax, %eax
-; SSE42-NEXT:    btl $2, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    btl $1, %edi
-; SSE42-NEXT:    movl $0, %edx
-; SSE42-NEXT:    sbbl %edx, %edx
-; SSE42-NEXT:    movl %edi, %r8d
-; SSE42-NEXT:    andb $1, %r8b
-; SSE42-NEXT:    negb %r8b
-; SSE42-NEXT:    movzbl %r8b, %r8d
-; SSE42-NEXT:    movd %r8d, %xmm0
-; SSE42-NEXT:    pinsrb $1, %edx, %xmm0
-; SSE42-NEXT:    pinsrb $2, %ecx, %xmm0
-; SSE42-NEXT:    btl $3, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $3, %ecx, %xmm0
-; SSE42-NEXT:    btl $4, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $4, %ecx, %xmm0
-; SSE42-NEXT:    btl $5, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $5, %ecx, %xmm0
-; SSE42-NEXT:    btl $6, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $6, %ecx, %xmm0
-; SSE42-NEXT:    movl %edi, %ecx
-; SSE42-NEXT:    sarb $7, %cl
-; SSE42-NEXT:    movzbl %cl, %ecx
-; SSE42-NEXT:    pinsrb $7, %ecx, %xmm0
-; SSE42-NEXT:    btl $8, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $8, %ecx, %xmm0
-; SSE42-NEXT:    btl $9, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $9, %ecx, %xmm0
-; SSE42-NEXT:    btl $10, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $10, %ecx, %xmm0
-; SSE42-NEXT:    btl $11, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $11, %ecx, %xmm0
-; SSE42-NEXT:    btl $12, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $12, %ecx, %xmm0
-; SSE42-NEXT:    btl $13, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $13, %ecx, %xmm0
-; SSE42-NEXT:    btl $14, %edi
-; SSE42-NEXT:    movl $0, %ecx
-; SSE42-NEXT:    sbbl %ecx, %ecx
-; SSE42-NEXT:    pinsrb $14, %ecx, %xmm0
-; SSE42-NEXT:    btl $15, %edi
-; SSE42-NEXT:    sbbl %eax, %eax
-; SSE42-NEXT:    pinsrb $15, %eax, %xmm0
-; SSE42-NEXT:    movd %edi, %xmm3
-; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; SSE42-NEXT:    pand %xmm4, %xmm3
-; SSE42-NEXT:    pcmpeqb %xmm4, %xmm3
+; SSE42-NEXT:    movd %edi, %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE42-NEXT:    pand %xmm3, %xmm0
+; SSE42-NEXT:    pcmpeqb %xmm3, %xmm0
 ; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, (%rsi)
+; SSE42-NEXT:    movdqa %xmm0, (%rsi)
 ; SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: select_cast_cond_multiuse_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    btl $1, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    movl %edi, %edx
-; AVX2-NEXT:    andb $1, %dl
-; AVX2-NEXT:    negb %dl
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $2, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $3, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $4, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $5, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $6, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    sarb $7, %cl
-; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $8, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $9, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $10, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $11, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $12, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $13, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $14, %edi
-; AVX2-NEXT:    movl $0, %ecx
-; AVX2-NEXT:    sbbl %ecx, %ecx
-; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    btl $15, %edi
-; AVX2-NEXT:    sbbl %eax, %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %edi, %xmm3
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
-; AVX2-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vmovd %edi, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vmovdqa %xmm3, (%rsi)
+; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: select_cast_cond_multiuse_v16i8:
@@ -716,118 +273,47 @@ define <16 x i8> @select_cast_cond_multiuse_v16i8(<16 x i8> %x, <16 x i8> %y, i1
 define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x float> %y, i8 %m, ptr %o) {
 ; SSE2-LABEL: select_cast_cond_multiuse_v8i16_v8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl %edi, %eax
-; SSE2-NEXT:    movl %edi, %ecx
-; SSE2-NEXT:    movl %edi, %edx
-; SSE2-NEXT:    movl %edi, %r8d
-; SSE2-NEXT:    movl %edi, %r9d
-; SSE2-NEXT:    movl %edi, %r10d
-; SSE2-NEXT:    movl %edi, %r11d
 ; SSE2-NEXT:    movd %edi, %xmm4
-; SSE2-NEXT:    # kill: def $dil killed $dil killed $edi
-; SSE2-NEXT:    shrb %dil
-; SSE2-NEXT:    andb $1, %dil
-; SSE2-NEXT:    movzbl %dil, %edi
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movzbl %al, %eax
-; SSE2-NEXT:    shrb $2, %cl
-; SSE2-NEXT:    andb $1, %cl
-; SSE2-NEXT:    movzbl %cl, %ecx
-; SSE2-NEXT:    shrb $3, %dl
-; SSE2-NEXT:    andb $1, %dl
-; SSE2-NEXT:    movzbl %dl, %edx
-; SSE2-NEXT:    shrb $4, %r8b
-; SSE2-NEXT:    andb $1, %r8b
-; SSE2-NEXT:    movzbl %r8b, %r8d
-; SSE2-NEXT:    shrb $5, %r9b
-; SSE2-NEXT:    andb $1, %r9b
-; SSE2-NEXT:    movzbl %r9b, %r9d
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    pinsrw $1, %edi, %xmm5
-; SSE2-NEXT:    pinsrw $2, %ecx, %xmm5
-; SSE2-NEXT:    pinsrw $3, %edx, %xmm5
-; SSE2-NEXT:    pinsrw $4, %r8d, %xmm5
-; SSE2-NEXT:    pinsrw $5, %r9d, %xmm5
-; SSE2-NEXT:    shrb $6, %r10b
-; SSE2-NEXT:    andb $1, %r10b
-; SSE2-NEXT:    movzbl %r10b, %eax
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm5
-; SSE2-NEXT:    shrb $7, %r11b
-; SSE2-NEXT:    movzbl %r11b, %eax
-; SSE2-NEXT:    pinsrw $7, %eax, %xmm5
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pcmpeqw %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    pslld $31, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pslld $31, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, (%rsi)
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pcmpeqw %xmm6, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1,2,4,8]
+; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm7
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm5, (%rsi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: select_cast_cond_multiuse_v8i16_v8f32:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps %xmm0, %xmm4
-; SSE42-NEXT:    movl %edi, %eax
-; SSE42-NEXT:    movl %edi, %ecx
-; SSE42-NEXT:    movl %edi, %edx
-; SSE42-NEXT:    movl %edi, %r8d
-; SSE42-NEXT:    movl %edi, %r9d
-; SSE42-NEXT:    movl %edi, %r10d
-; SSE42-NEXT:    movl %edi, %r11d
-; SSE42-NEXT:    movd %edi, %xmm6
-; SSE42-NEXT:    # kill: def $dil killed $dil killed $edi
-; SSE42-NEXT:    shrb %dil
-; SSE42-NEXT:    andb $1, %dil
-; SSE42-NEXT:    movzbl %dil, %edi
-; SSE42-NEXT:    andb $1, %al
-; SSE42-NEXT:    movzbl %al, %eax
-; SSE42-NEXT:    shrb $2, %cl
-; SSE42-NEXT:    andb $1, %cl
-; SSE42-NEXT:    movzbl %cl, %ecx
-; SSE42-NEXT:    shrb $3, %dl
-; SSE42-NEXT:    andb $1, %dl
-; SSE42-NEXT:    movzbl %dl, %edx
-; SSE42-NEXT:    shrb $4, %r8b
-; SSE42-NEXT:    andb $1, %r8b
-; SSE42-NEXT:    movzbl %r8b, %r8d
-; SSE42-NEXT:    shrb $5, %r9b
-; SSE42-NEXT:    andb $1, %r9b
-; SSE42-NEXT:    movzbl %r9b, %r9d
-; SSE42-NEXT:    movd %eax, %xmm5
-; SSE42-NEXT:    pinsrb $2, %edi, %xmm5
-; SSE42-NEXT:    pinsrb $4, %ecx, %xmm5
-; SSE42-NEXT:    pinsrb $6, %edx, %xmm5
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; SSE42-NEXT:    pinsrb $8, %r8d, %xmm5
-; SSE42-NEXT:    pinsrb $10, %r9d, %xmm5
-; SSE42-NEXT:    shrb $6, %r10b
-; SSE42-NEXT:    andb $1, %r10b
-; SSE42-NEXT:    movzbl %r10b, %eax
-; SSE42-NEXT:    pinsrb $12, %eax, %xmm5
-; SSE42-NEXT:    shrb $7, %r11b
-; SSE42-NEXT:    movzbl %r11b, %eax
-; SSE42-NEXT:    pinsrb $14, %eax, %xmm5
-; SSE42-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [1,2,4,8,16,32,64,128]
-; SSE42-NEXT:    pand %xmm7, %xmm6
-; SSE42-NEXT:    pcmpeqw %xmm7, %xmm6
-; SSE42-NEXT:    pslld $31, %xmm0
+; SSE42-NEXT:    movd %edi, %xmm0
+; SSE42-NEXT:    pshuflw {{.*#+}} xmm5 = xmm0[0,0,0,0,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; SSE42-NEXT:    pand %xmm5, %xmm6
+; SSE42-NEXT:    pcmpeqw %xmm5, %xmm6
+; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [1,2,4,8]
+; SSE42-NEXT:    movdqa %xmm5, %xmm0
+; SSE42-NEXT:    pand %xmm7, %xmm0
+; SSE42-NEXT:    pcmpeqd %xmm7, %xmm0
 ; SSE42-NEXT:    blendvps %xmm0, %xmm4, %xmm2
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE42-NEXT:    pslld $31, %xmm5
+; SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [16,32,64,128]
+; SSE42-NEXT:    pand %xmm0, %xmm5
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm5
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
 ; SSE42-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movdqa %xmm6, (%rsi)
@@ -837,59 +323,16 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f
 ;
 ; AVX2-LABEL: select_cast_cond_multiuse_v8i16_v8f32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    movl %edi, %r8d
-; AVX2-NEXT:    movl %edi, %edx
-; AVX2-NEXT:    movl %edi, %r10d
-; AVX2-NEXT:    movl %edi, %r9d
-; AVX2-NEXT:    movl %edi, %r11d
 ; AVX2-NEXT:    vmovd %edi, %xmm2
-; AVX2-NEXT:    # kill: def $dil killed $dil killed $edi
-; AVX2-NEXT:    shrb $5, %dil
-; AVX2-NEXT:    movzbl %dil, %edi
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    shrb $6, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    shrb $7, %r8b
-; AVX2-NEXT:    movzbl %r8b, %r8d
-; AVX2-NEXT:    shrb %r10b
-; AVX2-NEXT:    movzbl %r10b, %r10d
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vmovd %eax, %xmm3
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    negl %edi
-; AVX2-NEXT:    vpinsrd $1, %edi, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    negl %ecx
-; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    negl %r8d
-; AVX2-NEXT:    vpinsrd $3, %r8d, %xmm3, %xmm3
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    negl %edx
-; AVX2-NEXT:    vmovd %edx, %xmm4
-; AVX2-NEXT:    andl $1, %r10d
-; AVX2-NEXT:    negl %r10d
-; AVX2-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
-; AVX2-NEXT:    shrb $2, %r9b
-; AVX2-NEXT:    movzbl %r9b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    shrb $3, %r11b
-; AVX2-NEXT:    movzbl %r11b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
-; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm3, (%rsi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: select_cast_cond_multiuse_v8i16_v8f32:


        


More information about the llvm-commits mailing list