[llvm] r265998 - [DAGCombiner] Fold xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A, B)) anytime before LegalizeVectorOprs

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 11 14:10:33 PDT 2016


Author: rksimon
Date: Mon Apr 11 16:10:33 2016
New Revision: 265998

URL: http://llvm.org/viewvc/llvm-project?rev=265998&view=rev
Log:
[DAGCombiner] Fold xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) anytime before LegalizeVectorOprs

xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) was only being combined at the AfterLegalizeTypes stage; this patch permits the combine to occur anytime before then as well.
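
For reference, the guard being relaxed compares the combiner's CombineLevel. A simplified sketch of the predicate change (enum order as declared in llvm/include/llvm/CodeGen/DAGCombine.h; foldAllowed is a hypothetical wrapper for illustration, everything else elided):

  enum CombineLevel {
    BeforeLegalizeTypes,    // now also accepted by the fold
    AfterLegalizeTypes,     // previously the only accepted level
    AfterLegalizeVectorOps, // still excluded: would undo promotions
    AfterLegalizeDAG
  };

  // Hypothetical wrapper for illustration only.
  // Old guard: Level == AfterLegalizeTypes
  // New guard: Level <= AfterLegalizeTypes
  bool foldAllowed(CombineLevel Level) {
    return Level <= AfterLegalizeTypes;
  }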

The main aim of this is to improve the ability to recognise bitmasks that can be converted to shuffles.
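
As a standalone illustration of the equivalence involved (plain C++, not LLVM code; the loop models lane-wise vector ops): when each lane's mask is all-ones or all-zeros, (a & m) | (b & ~m) is a per-lane select, which the shuffle lowering can then turn into a single blend, as in the mask_v4f32_0127 changes below.

  #include <cassert>
  #include <cstdint>

  // Per-lane bitmasking with all-ones/all-zeros masks is a lane select --
  // the pattern this combine makes visible to the shuffle lowering.
  int main() {
    uint32_t A[4] = {1, 2, 3, 4};
    uint32_t B[4] = {5, 6, 7, 8};
    uint32_t M[4] = {0, 0, 0, 0xFFFFFFFFu}; // take B in lanes 0-2, A in lane 3

    for (int I = 0; I < 4; ++I) {
      uint32_t Masked  = (A[I] & M[I]) | (B[I] & ~M[I]);
      uint32_t Blended = M[I] ? A[I] : B[I]; // equivalent blend/select
      assert(Masked == Blended);
    }
    return 0;
  }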

I had to modify a number of AVX512 mask tests, as the basic bitcast to/from scalar pattern was being stripped out, preventing testing of the mmask bitops. By replacing the bitcasts with loads we can get almost the same result.

Differential Revision: http://reviews.llvm.org/D18944

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-select.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
    llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll
    llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon Apr 11 16:10:33 2016
@@ -2765,7 +2765,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSa
   }
 
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
-  // Only perform this optimization after type legalization and before
+  // Only perform this optimization up until type legalization, before
   // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
   // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
   // we don't want to undo this promotion.
@@ -2773,7 +2773,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSa
   // on scalars.
   if ((N0.getOpcode() == ISD::BITCAST ||
        N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
-      Level == AfterLegalizeTypes) {
+       Level <= AfterLegalizeTypes) {
     SDValue In0 = N0.getOperand(0);
     SDValue In1 = N1.getOperand(0);
     EVT In0Ty = In0.getValueType();

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Mon Apr 11 16:10:33 2016
@@ -77,15 +77,33 @@ define void @mask8_mem(i8* %ptr) {
 define i16 @mand16(i16 %x, i16 %y) {
 ; CHECK-LABEL: mand16:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    movw %di, %ax
+; CHECK-NEXT:    retq
+  %ma = bitcast i16 %x to <16 x i1>
+  %mb = bitcast i16 %y to <16 x i1>
+  %mc = and <16 x i1> %ma, %mb
+  %md = xor <16 x i1> %ma, %mb
+  %me = or <16 x i1> %mc, %md
+  %ret = bitcast <16 x i1> %me to i16
+  ret i16 %ret
+}
+
+define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
+; CHECK-LABEL: mand16_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw (%rdi), %k0
+; CHECK-NEXT:    kmovw (%rsi), %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k2
 ; CHECK-NEXT:    kxorw %k1, %k0, %k0
 ; CHECK-NEXT:    korw %k0, %k2, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i16 %x to <16 x i1>
-  %mb = bitcast i16 %y to <16 x i1>
+  %ma = load <16 x i1>, <16 x i1>* %x
+  %mb = load <16 x i1>, <16 x i1>* %y
   %mc = and <16 x i1> %ma, %mb
   %md = xor <16 x i1> %ma, %mb
   %me = or <16 x i1> %mc, %md
@@ -265,13 +283,13 @@ define <16 x i8> @test8(<16 x i32>%a, <1
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    jg LBB14_1
+; KNL-NEXT:    jg LBB15_1
 ; KNL-NEXT:  ## BB#2:
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm1, %k1
-; KNL-NEXT:    jmp LBB14_3
-; KNL-NEXT:  LBB14_1:
+; KNL-NEXT:    jmp LBB15_3
+; KNL-NEXT:  LBB15_1:
 ; KNL-NEXT:    vpcmpgtd %zmm2, %zmm0, %k1
-; KNL-NEXT:  LBB14_3:
+; KNL-NEXT:  LBB15_3:
 ; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -280,12 +298,12 @@ define <16 x i8> @test8(<16 x i32>%a, <1
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; SKX-NEXT:    cmpl %esi, %edi
-; SKX-NEXT:    jg LBB14_1
+; SKX-NEXT:    jg LBB15_1
 ; SKX-NEXT:  ## BB#2:
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB14_1:
+; SKX-NEXT:  LBB15_1:
 ; SKX-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
 ; SKX-NEXT:    retq
@@ -300,13 +318,13 @@ define <16 x i1> @test9(<16 x i1>%a, <16
 ; KNL-LABEL: test9:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    jg LBB15_1
+; KNL-NEXT:    jg LBB16_1
 ; KNL-NEXT:  ## BB#2:
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
-; KNL-NEXT:    jmp LBB15_3
-; KNL-NEXT:  LBB15_1:
+; KNL-NEXT:    jmp LBB16_3
+; KNL-NEXT:  LBB16_1:
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:  LBB15_3:
+; KNL-NEXT:  LBB16_3:
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
@@ -316,13 +334,13 @@ define <16 x i1> @test9(<16 x i1>%a, <16
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
-; SKX-NEXT:    jg LBB15_1
+; SKX-NEXT:    jg LBB16_1
 ; SKX-NEXT:  ## BB#2:
 ; SKX-NEXT:    vpsllw $7, %xmm1, %xmm0
-; SKX-NEXT:    jmp LBB15_3
-; SKX-NEXT:  LBB15_1:
+; SKX-NEXT:    jmp LBB16_3
+; SKX-NEXT:  LBB16_1:
 ; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
-; SKX-NEXT:  LBB15_3:
+; SKX-NEXT:  LBB16_3:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
 ; SKX-NEXT:    retq
@@ -339,22 +357,22 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x
 ; KNL-LABEL: test11:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    jg LBB17_2
+; KNL-NEXT:    jg LBB18_2
 ; KNL-NEXT:  ## BB#1:
 ; KNL-NEXT:    vmovaps %zmm1, %zmm0
-; KNL-NEXT:  LBB17_2:
+; KNL-NEXT:  LBB18_2:
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test11:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
-; SKX-NEXT:    jg LBB17_1
+; SKX-NEXT:    jg LBB18_1
 ; SKX-NEXT:  ## BB#2:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm0
-; SKX-NEXT:    jmp LBB17_3
-; SKX-NEXT:  LBB17_1:
+; SKX-NEXT:    jmp LBB18_3
+; SKX-NEXT:  LBB18_1:
 ; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
-; SKX-NEXT:  LBB17_3:
+; SKX-NEXT:  LBB18_3:
 ; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
 ; SKX-NEXT:    retq
@@ -794,11 +812,11 @@ define void @ktest_1(<8 x double> %in, d
 ; KNL-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB38_2
+; KNL-NEXT:    je LBB39_2
 ; KNL-NEXT:  ## BB#1: ## %L1
 ; KNL-NEXT:    vmovapd %zmm0, (%rdi)
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB38_2: ## %L2
+; KNL-NEXT:  LBB39_2: ## %L2
 ; KNL-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -809,11 +827,11 @@ define void @ktest_1(<8 x double> %in, d
 ; SKX-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
 ; SKX-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; SKX-NEXT:    ktestb %k0, %k0
-; SKX-NEXT:    je LBB38_2
+; SKX-NEXT:    je LBB39_2
 ; SKX-NEXT:  ## BB#1: ## %L1
 ; SKX-NEXT:    vmovapd %zmm0, (%rdi)
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB38_2: ## %L2
+; SKX-NEXT:  LBB39_2: ## %L2
 ; SKX-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; SKX-NEXT:    retq
   %addr1 = getelementptr double, double * %base, i64 0
@@ -859,12 +877,12 @@ define void @ktest_2(<32 x float> %in, f
 ; SKX-NEXT:    kunpckwd %k1, %k2, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    ktestd %k0, %k0
-; SKX-NEXT:    je LBB39_2
+; SKX-NEXT:    je LBB40_2
 ; SKX-NEXT:  ## BB#1: ## %L1
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB39_2: ## %L2
+; SKX-NEXT:  LBB40_2: ## %L2
 ; SKX-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; SKX-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-select.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-select.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-select.ll Mon Apr 11 16:10:33 2016
@@ -71,10 +71,8 @@ define <16 x double> @select04(<16 x dou
 define i8 @select05(i8 %a.0, i8 %m) {
 ; CHECK-LABEL: select05:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    korw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    movb %dil, %al
 ; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
@@ -83,13 +81,28 @@ define i8 @select05(i8 %a.0, i8 %m) {
   ret i8 %res;
 }
 
+define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select05_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movzbw (%rsi), %ax
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    movzbw (%rdi), %ax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq
+  %mask = load <8 x i1> , <8 x i1>* %m
+  %a = load <8 x i1> , <8 x i1>* %a.0
+  %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
+
 define i8 @select06(i8 %a.0, i8 %m) {
 ; CHECK-LABEL: select06:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    movb %dil, %al
 ; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
@@ -98,6 +111,22 @@ define i8 @select06(i8 %a.0, i8 %m) {
   ret i8 %res;
 }
 
+define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select06_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movzbw (%rsi), %ax
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    movzbw (%rdi), %ax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kandw %k1, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq
+  %mask = load <8 x i1> , <8 x i1>* %m
+  %a = load <8 x i1> , <8 x i1>* %a.0
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
 define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 ; CHECK-LABEL: select07:
 ; CHECK:       ## BB#0:

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll Mon Apr 11 16:10:33 2016
@@ -80,15 +80,33 @@ define void @mask64_mem(i64* %ptr) {
 define i32 @mand32(i32 %x, i32 %y) {
 ; CHECK-LABEL: mand32:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k0
-; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %ma = bitcast i32 %x to <32 x i1>
+  %mb = bitcast i32 %y to <32 x i1>
+  %mc = and <32 x i1> %ma, %mb
+  %md = xor <32 x i1> %ma, %mb
+  %me = or <32 x i1> %mc, %md
+  %ret = bitcast <32 x i1> %me to i32
+  ret i32 %ret
+}
+
+define i32 @mand32_mem(<32 x i1>* %x, <32 x i1>* %y) {
+; CHECK-LABEL: mand32_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd (%rdi), %k0
+; CHECK-NEXT:    kmovd (%rsi), %k1
 ; CHECK-NEXT:    kandd %k1, %k0, %k2
 ; CHECK-NEXT:    kxord %k1, %k0, %k0
 ; CHECK-NEXT:    kord %k0, %k2, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i32 %x to <32 x i1>
-  %mb = bitcast i32 %y to <32 x i1>
+  %ma = load <32 x i1>, <32 x i1>* %x
+  %mb = load <32 x i1>, <32 x i1>* %y
   %mc = and <32 x i1> %ma, %mb
   %md = xor <32 x i1> %ma, %mb
   %me = or <32 x i1> %mc, %md
@@ -99,15 +117,33 @@ define i32 @mand32(i32 %x, i32 %y) {
 define i64 @mand64(i64 %x, i64 %y) {
 ; CHECK-LABEL: mand64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovq %rdi, %k0
-; CHECK-NEXT:    kmovq %rsi, %k1
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    xorq %rsi, %rdi
+; CHECK-NEXT:    orq %rax, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
+  %ma = bitcast i64 %x to <64 x i1>
+  %mb = bitcast i64 %y to <64 x i1>
+  %mc = and <64 x i1> %ma, %mb
+  %md = xor <64 x i1> %ma, %mb
+  %me = or <64 x i1> %mc, %md
+  %ret = bitcast <64 x i1> %me to i64
+  ret i64 %ret
+}
+
+define i64 @mand64_mem(<64 x i1>* %x, <64 x i1>* %y) {
+; CHECK-LABEL: mand64_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovq (%rdi), %k0
+; CHECK-NEXT:    kmovq (%rsi), %k1
 ; CHECK-NEXT:    kandq %k1, %k0, %k2
 ; CHECK-NEXT:    kxorq %k1, %k0, %k0
 ; CHECK-NEXT:    korq %k0, %k2, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i64 %x to <64 x i1>
-  %mb = bitcast i64 %y to <64 x i1>
+  %ma = load <64 x i1>, <64 x i1>* %x
+  %mb = load <64 x i1>, <64 x i1>* %y
   %mc = and <64 x i1> %ma, %mb
   %md = xor <64 x i1> %ma, %mb
   %me = or <64 x i1> %mc, %md

Modified: llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll Mon Apr 11 16:10:33 2016
@@ -32,15 +32,33 @@ define void @mask8_mem(i8* %ptr) {
 define i8 @mand8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    movb %dil, %al
+; CHECK-NEXT:    retq
+  %ma = bitcast i8 %x to <8 x i1>
+  %mb = bitcast i8 %y to <8 x i1>
+  %mc = and <8 x i1> %ma, %mb
+  %md = xor <8 x i1> %ma, %mb
+  %me = or <8 x i1> %mc, %md
+  %ret = bitcast <8 x i1> %me to i8
+  ret i8 %ret
+}
+
+define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) {
+; CHECK-LABEL: mand8_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    kmovb (%rsi), %k1
 ; CHECK-NEXT:    kandb %k1, %k0, %k2
 ; CHECK-NEXT:    kxorb %k1, %k0, %k0
 ; CHECK-NEXT:    korb %k0, %k2, %k0
 ; CHECK-NEXT:    kmovb %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i8 %x to <8 x i1>
-  %mb = bitcast i8 %y to <8 x i1>
+  %ma = load <8 x i1>, <8 x i1>* %x
+  %mb = load <8 x i1>, <8 x i1>* %y
   %mc = and <8 x i1> %ma, %mb
   %md = xor <8 x i1> %ma, %mb
   %me = or <8 x i1> %mc, %md

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Mon Apr 11 16:10:33 2016
@@ -1869,48 +1869,34 @@ define <4 x float> @mask_v4f32_4127(<4 x
 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: mask_v4f32_0127:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: mask_v4f32_0127:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    orps %xmm1, %xmm0
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v4f32_0127:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v4f32_0127:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: mask_v4f32_0127:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mask_v4f32_0127:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: mask_v4f32_0127:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT:    retq
   %1 = bitcast <4 x float> %a to <2 x i64>
   %2 = bitcast <4 x float> %b to <2 x i64>
   %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
@@ -1923,47 +1909,38 @@ define <4 x float> @mask_v4f32_0127(<4 x
 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: mask_v4i32_0127:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: mask_v4i32_0127:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    orps %xmm1, %xmm0
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v4i32_0127:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v4i32_0127:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: mask_v4i32_0127:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mask_v4i32_0127:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX2-NEXT:    retq
   %1 = bitcast <4 x i32> %a to <2 x i64>
   %2 = bitcast <4 x i32> %b to <2 x i64>

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Mon Apr 11 16:10:33 2016
@@ -2140,40 +2140,31 @@ define <8 x i16> @shuffle_v8i16_8012345u
 define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: mask_v8i16_012345ef:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v8i16_012345ef:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v8i16_012345ef:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: mask_v8i16_012345ef:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mask_v8i16_012345ef:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX2-NEXT:    retq
   %1 = bitcast <8 x i16> %a to <2 x i64>
   %2 = bitcast <8 x i16> %b to <2 x i64>

Modified: llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll Mon Apr 11 16:10:33 2016
@@ -9,24 +9,14 @@
 define i24 @and_i24_as_v3i8(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: and_i24_as_v3i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_i24_as_v3i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    pand %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -38,24 +28,14 @@ define i24 @and_i24_as_v3i8(i24 %a, i24
 define i24 @xor_i24_as_v3i8(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: xor_i24_as_v3i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pxor %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_i24_as_v3i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    xorl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -67,24 +47,14 @@ define i24 @xor_i24_as_v3i8(i24 %a, i24
 define i24 @or_i24_as_v3i8(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: or_i24_as_v3i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    por %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_i24_as_v3i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    por %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    orl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -100,186 +70,14 @@ define i24 @or_i24_as_v3i8(i24 %a, i24 %
 define i24 @and_i24_as_v8i3(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: and_i24_as_v8i3:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    pand %xmm1, %xmm0
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT:    shll $16, %ecx
-; X32-SSE-NEXT:    movzwl (%esp), %eax
-; X32-SSE-NEXT:    orl %ecx, %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_i24_as_v8i3:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %esi
-; X64-SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movw %di, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edi
-; X64-SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $3, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movl %ecx, %esi
-; X64-SSE-NEXT:    andl $7, %esi
-; X64-SSE-NEXT:    movd %esi, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $6, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $2, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $9, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $3, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $12, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $4, %edx, %xmm1
-; X64-SSE-NEXT:    shrl $15, %ecx
-; X64-SSE-NEXT:    movzwl %cx, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X64-SSE-NEXT:    pand %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    shll $16, %ecx
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    orl %ecx, %eax
+; X64-SSE-NEXT:    andl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
@@ -291,186 +89,14 @@ define i24 @and_i24_as_v8i3(i24 %a, i24
 define i24 @xor_i24_as_v8i3(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: xor_i24_as_v8i3:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    pxor %xmm1, %xmm0
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT:    shll $16, %ecx
-; X32-SSE-NEXT:    movzwl (%esp), %eax
-; X32-SSE-NEXT:    orl %ecx, %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_i24_as_v8i3:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %esi
-; X64-SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movw %di, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edi
-; X64-SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $3, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movl %ecx, %esi
-; X64-SSE-NEXT:    andl $7, %esi
-; X64-SSE-NEXT:    movd %esi, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $6, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $2, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $9, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $3, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $12, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $4, %edx, %xmm1
-; X64-SSE-NEXT:    shrl $15, %ecx
-; X64-SSE-NEXT:    movzwl %cx, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X64-SSE-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    shll $16, %ecx
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    orl %ecx, %eax
+; X64-SSE-NEXT:    xorl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
@@ -482,186 +108,14 @@ define i24 @xor_i24_as_v8i3(i24 %a, i24
 define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
 ; X32-SSE-LABEL: or_i24_as_v8i3:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; X32-SSE-NEXT:    por %xmm1, %xmm0
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT:    shll $16, %ecx
-; X32-SSE-NEXT:    movzwl (%esp), %eax
-; X32-SSE-NEXT:    orl %ecx, %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_i24_as_v8i3:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %esi
-; X64-SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movw %di, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edi
-; X64-SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $3, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movl %ecx, %esi
-; X64-SSE-NEXT:    andl $7, %esi
-; X64-SSE-NEXT:    movd %esi, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $6, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $2, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $9, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $3, %edx, %xmm1
-; X64-SSE-NEXT:    movl %ecx, %edx
-; X64-SSE-NEXT:    shrl $12, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    pinsrw $4, %edx, %xmm1
-; X64-SSE-NEXT:    shrl $15, %ecx
-; X64-SSE-NEXT:    movzwl %cx, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X64-SSE-NEXT:    por %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    shll $16, %ecx
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    orl %ecx, %eax
+; X64-SSE-NEXT:    orl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
@@ -677,22 +131,16 @@ define i24 @or_i24_as_v8i3(i24 %a, i24 %
 define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: and_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %eax
 ; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    andl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X32-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X32-SSE-NEXT:    pextrb $8, %xmm0, %ecx
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_v3i8_as_i24:
@@ -700,20 +148,13 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8
 ; X64-SSE-NEXT:    movd %ecx, %xmm0
 ; X64-SSE-NEXT:    pinsrd $1, %r8d, %xmm0
 ; X64-SSE-NEXT:    pinsrd $2, %r9d, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    movd %edi, %xmm0
-; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm0
-; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm0
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    andl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X64-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X64-SSE-NEXT:    pextrb $8, %xmm0, %ecx
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT:    pand %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <3 x i8> %a to i24
   %2 = bitcast <3 x i8> %b to i24
@@ -725,22 +166,16 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8
 define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: xor_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %eax
 ; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    xorl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X32-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X32-SSE-NEXT:    pextrb $8, %xmm0, %ecx
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pxor %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_v3i8_as_i24:
@@ -748,20 +183,13 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8
 ; X64-SSE-NEXT:    movd %ecx, %xmm0
 ; X64-SSE-NEXT:    pinsrd $1, %r8d, %xmm0
 ; X64-SSE-NEXT:    pinsrd $2, %r9d, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    movd %edi, %xmm0
-; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm0
-; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm0
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    xorl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X64-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X64-SSE-NEXT:    pextrb $8, %xmm0, %ecx
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <3 x i8> %a to i24
   %2 = bitcast <3 x i8> %b to i24
@@ -773,22 +201,16 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8
 define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: or_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %eax
 ; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    orl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X32-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X32-SSE-NEXT:    pextrb $8, %xmm0, %ecx
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    por %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_v3i8_as_i24:
@@ -796,20 +218,13 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8>
 ; X64-SSE-NEXT:    movd %ecx, %xmm0
 ; X64-SSE-NEXT:    pinsrd $1, %r8d, %xmm0
 ; X64-SSE-NEXT:    pinsrd $2, %r9d, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    movd %edi, %xmm0
-; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm0
-; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm0
-; X64-SSE-NEXT:    pshufb %xmm1, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    orl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    pextrb $0, %xmm0, %eax
-; X64-SSE-NEXT:    pextrb $4, %xmm0, %edx
-; X64-SSE-NEXT:    pextrb $8, %xmm0, %ecx
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT:    pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT:    por %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT:    pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT:    pextrb $8, %xmm1, %ecx
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <3 x i8> %a to i24
   %2 = bitcast <3 x i8> %b to i24
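For reference, the three v3i8-as-i24 tests above share one shape: a scalar i24 bitop between two vector bitcasts. Allowing the fold to fire before type legalization keeps the op in the vector domain (pand/pxor/por on the pinsr'd inputs) instead of round-tripping through a GPR and re-expanding with pmovzxbd. The op and trailing bitcast fall outside the hunk context, so this sketch is an assumed reconstruction with a hypothetical name:

define <3 x i8> @or_v3i8_as_i24_sketch(<3 x i8> %a, <3 x i8> %b) nounwind {
  %1 = bitcast <3 x i8> %a to i24     ; both sides are 24 bits, so these are plain bitcasts
  %2 = bitcast <3 x i8> %b to i24
  %3 = or i24 %1, %2                  ; or(bitcast(A), bitcast(B)) -> bitcast(or(A, B))
  %4 = bitcast i24 %3 to <3 x i8>     ; cancels the bitcast, leaving a vector or
  ret <3 x i8> %4
}
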
@@ -825,186 +240,12 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8>
 define <8 x i3> @and_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
 ; X32-SSE-LABEL: and_v8i3_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    shll $16, %eax
-; X32-SSE-NEXT:    movzwl (%esp), %ecx
-; X32-SSE-NEXT:    orl %eax, %ecx
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X32-SSE-NEXT:    shll $16, %edx
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    orl %edx, %eax
-; X32-SSE-NEXT:    andl %ecx, %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm0, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_v8i3_as_i24:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    orl %eax, %ecx
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %edx
-; X64-SSE-NEXT:    orl %eax, %edx
-; X64-SSE-NEXT:    andl %ecx, %edx
-; X64-SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edx
-; X64-SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    andps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i3> %a to i24
   %2 = bitcast <8 x i3> %b to i24
@@ -1016,186 +257,12 @@ define <8 x i3> @and_v8i3_as_i24(<8 x i3
 define <8 x i3> @xor_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
 ; X32-SSE-LABEL: xor_v8i3_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    shll $16, %eax
-; X32-SSE-NEXT:    movzwl (%esp), %ecx
-; X32-SSE-NEXT:    orl %eax, %ecx
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X32-SSE-NEXT:    shll $16, %edx
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    orl %edx, %eax
-; X32-SSE-NEXT:    xorl %ecx, %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm0, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_v8i3_as_i24:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    orl %eax, %ecx
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %edx
-; X64-SSE-NEXT:    orl %eax, %edx
-; X64-SSE-NEXT:    xorl %ecx, %edx
-; X64-SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edx
-; X64-SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i3> %a to i24
   %2 = bitcast <8 x i3> %b to i24
@@ -1207,186 +274,12 @@ define <8 x i3> @xor_v8i3_as_i24(<8 x i3
 define <8 x i3> @or_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
 ; X32-SSE-LABEL: or_v8i3_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    shll $16, %eax
-; X32-SSE-NEXT:    movzwl (%esp), %ecx
-; X32-SSE-NEXT:    orl %eax, %ecx
-; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X32-SSE-NEXT:    shll $16, %edx
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    orl %edx, %eax
-; X32-SSE-NEXT:    orl %ecx, %eax
-; X32-SSE-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    shrl $16, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $3, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $6, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $9, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $7, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm0, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_v8i3_as_i24:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT:    orl %eax, %ecx
-; X64-SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %edx
-; X64-SSE-NEXT:    orl %eax, %edx
-; X64-SSE-NEXT:    orl %ecx, %edx
-; X64-SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    shrl $16, %edx
-; X64-SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $3, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $7, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $6, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $9, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $7, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $15, %eax
-; X64-SSE-NEXT:    movzwl %ax, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    xorl %eax, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    orps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i3> %a to i24
   %2 = bitcast <8 x i3> %b to i24
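The v8i3-as-i24 diffs above show the biggest win: the long pextrw/andl/movb scalarize-and-repack sequences collapse to a single andps/xorps/orps, because the i24 bitop between the two bitcasts is folded back into the vector domain before type legalization gets a chance to scalarize the illegal <8 x i3> operands. The IR shape is the same as the sketch above, just with <8 x i3> in place of <3 x i8>.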

Modified: llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll?rev=265998&r1=265997&r2=265998&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll Mon Apr 11 16:10:33 2016
@@ -9,24 +9,14 @@
 define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: and_i32_as_v4i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %eax
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_i32_as_v4i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    pand %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
@@ -38,24 +28,14 @@ define i32 @and_i32_as_v4i8(i32 %a, i32
 define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: xor_i32_as_v4i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %eax
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pxor %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_i32_as_v4i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    xorl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
@@ -67,24 +47,14 @@ define i32 @xor_i32_as_v4i8(i32 %a, i32
 define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: or_i32_as_v4i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %eax
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X32-SSE-NEXT:    por %xmm0, %xmm1
-; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_i32_as_v4i8:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movd %esi, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE-NEXT:    movd %edi, %xmm1
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-SSE-NEXT:    por %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    orl %esi, %edi
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
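The i32-as-v4i8 tests exercise the fold in the opposite direction: the scalars are bitcast to vectors, the bitop is done as a vector op, and the result is bitcast back, so with the combine firing earlier the whole function now selects to a plain scalar andl/xorl/orl. Again the op and return sit outside the hunk context, so this is an assumed reconstruction with a hypothetical name:

define i32 @xor_i32_as_v4i8_sketch(i32 %a, i32 %b) nounwind {
  %1 = bitcast i32 %a to <4 x i8>
  %2 = bitcast i32 %b to <4 x i8>
  %3 = xor <4 x i8> %1, %2            ; xor(bitcast(A), bitcast(B)) -> bitcast(xor(A, B))
  %4 = bitcast <4 x i8> %3 to i32     ; cancels the bitcast, leaving a plain i32 xor
  ret i32 %4
}
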
@@ -100,186 +70,14 @@ define i32 @or_i32_as_v4i8(i32 %a, i32 %
 define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: and_i32_as_v8i4:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl (%esp), %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_i32_as_v8i4:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %esi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    shrl $28, %esi
-; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
+; X64-SSE-NEXT:    andl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %edi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    shrl $28, %edi
-; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
-; X64-SSE-NEXT:    pand %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
@@ -291,186 +89,14 @@ define i32 @and_i32_as_v8i4(i32 %a, i32
 define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: xor_i32_as_v8i4:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X32-SSE-NEXT:    pxor %xmm0, %xmm1
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl (%esp), %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_i32_as_v8i4:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %esi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    shrl $28, %esi
-; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %edi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; X64-SSE-NEXT:    xorl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    shrl $28, %edi
-; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
-; X64-SSE-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
@@ -482,186 +108,14 @@ define i32 @xor_i32_as_v8i4(i32 %a, i32
 define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
 ; X32-SSE-LABEL: or_i32_as_v8i4:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    movl 12(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl 8(%ebp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; X32-SSE-NEXT:    por %xmm0, %xmm1
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl (%esp), %eax
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_i32_as_v8i4:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %esi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; X64-SSE-NEXT:    movl %esi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; X64-SSE-NEXT:    shrl $28, %esi
-; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $4, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movl %edi, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $8, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; X64-SSE-NEXT:    orl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $12, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $16, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $20, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X64-SSE-NEXT:    movl %edi, %eax
-; X64-SSE-NEXT:    shrl $24, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X64-SSE-NEXT:    shrl $28, %edi
-; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
-; X64-SSE-NEXT:    por %xmm0, %xmm1
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
@@ -677,28 +131,12 @@ define i32 @or_i32_as_v8i4(i32 %a, i32 %
 define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
 ; X32-SSE-LABEL: and_v4i8_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    andl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_v4i8_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    andl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    andps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <4 x i8> %a to i32
   %2 = bitcast <4 x i8> %b to i32
@@ -710,28 +148,12 @@ define <4 x i8> @and_v4i8_as_i32(<4 x i8
 define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
 ; X32-SSE-LABEL: xor_v4i8_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    xorl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_v4i8_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    xorl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <4 x i8> %a to i32
   %2 = bitcast <4 x i8> %b to i32
@@ -743,28 +165,12 @@ define <4 x i8> @xor_v4i8_as_i32(<4 x i8
 define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
 ; X32-SSE-LABEL: or_v4i8_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    subl $12, %esp
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    orl %eax, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm0
-; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_v4i8_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    orl %eax, %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    orps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <4 x i8> %a to i32
   %2 = bitcast <4 x i8> %b to i32
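And in the vector-result direction the same cleanup as in widen_bitops-0.ll applies: the pshufb pack (with its <0,4,8,12,...> shuffle constant), the scalar GPR bitop and the pmovzxbd unpack all disappear, leaving just the single SSE logic instruction on the already-vectorized arguments.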
@@ -780,174 +186,12 @@ define <4 x i8> @or_v4i8_as_i32(<4 x i8>
 define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
 ; X32-SSE-LABEL: and_v8i4_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    andl (%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: and_v8i4_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $4, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $15, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $8, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $16, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $20, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $24, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $28, %eax
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    andps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i4> %a to i32
   %2 = bitcast <8 x i4> %b to i32
@@ -959,174 +203,12 @@ define <8 x i4> @and_v8i4_as_i32(<8 x i4
 define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
 ; X32-SSE-LABEL: xor_v8i4_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    xorl (%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: xor_v8i4_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    xorl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $4, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $15, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $8, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $16, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $20, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $24, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $28, %eax
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    xorps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i4> %a to i32
   %2 = bitcast <8 x i4> %b to i32
@@ -1138,174 +220,12 @@ define <8 x i4> @xor_v8i4_as_i32(<8 x i4
 define <8 x i4> @or_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
 ; X32-SSE-LABEL: or_v8i4_as_i32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pushl %ebp
-; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-8, %esp
-; X32-SSE-NEXT:    subl $24, %esp
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    movd %xmm0, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movd %xmm1, %eax
-; X32-SSE-NEXT:    andl $15, %eax
-; X32-SSE-NEXT:    movb %al, (%esp)
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    orl (%esp), %eax
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $4, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    movl %eax, %edx
-; X32-SSE-NEXT:    andl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm0
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $8, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $12, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $16, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $20, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X32-SSE-NEXT:    movl %eax, %ecx
-; X32-SSE-NEXT:    shrl $24, %ecx
-; X32-SSE-NEXT:    andl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X32-SSE-NEXT:    shrl $28, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
-; X32-SSE-NEXT:    movl %ebp, %esp
-; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: or_v8i4_as_i32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    andl $15, %eax
-; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    orl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $4, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    movl %eax, %edx
-; X64-SSE-NEXT:    andl $15, %edx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $8, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $12, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $16, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $20, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
-; X64-SSE-NEXT:    movl %eax, %ecx
-; X64-SSE-NEXT:    shrl $24, %ecx
-; X64-SSE-NEXT:    andl $15, %ecx
-; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
-; X64-SSE-NEXT:    shrl $28, %eax
-; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    orps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
   %1 = bitcast <8 x i4> %a to i32
   %2 = bitcast <8 x i4> %b to i32

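For reference, here is a minimal standalone sketch (not a test from the patch; the function name is illustrative) of the scalar-bitcast pattern the tests above exercise. The i32 xor performed between the two bitcast operands is folded back into a vector xor, which is why the long pshufb/movd/xorl sequences above collapse to a single xorps (and likewise orps/andps for the or/and variants):

; Illustrative sketch only -- not a test from this patch.
define <4 x i8> @sketch_xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
  %sa = bitcast <4 x i8> %a to i32   ; vector -> scalar (32 bits either way)
  %sb = bitcast <4 x i8> %b to i32
  %sx = xor i32 %sa, %sb             ; scalar bitop between the two bitcasts
  %vx = bitcast i32 %sx to <4 x i8>  ; scalar -> vector
  ret <4 x i8> %vx
}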