[llvm] r265930 - [X86] Added extra widening tests for and/xor/or bit operations

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 11 04:10:36 PDT 2016


Author: rksimon
Date: Mon Apr 11 06:10:36 2016
New Revision: 265930

URL: http://llvm.org/viewvc/llvm-project?rev=265930&view=rev
Log:
[X86] Added extra widening tests for and/xor/or bit operations

Add tests for bitcasting an illegal vector to/from a legal scalar

Additional tests requested for D18944

Added:
    llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll

Added: llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll?rev=265930&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll (added)
+++ llvm/trunk/test/CodeGen/X86/widen_bitops-1.ll Mon Apr 11 06:10:36 2016
@@ -0,0 +1,1315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE42
+
+;
+; AND/XOR/OR of i32 arguments bitcast to v4i8, result bitcast back to i32
+;
+
+define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v4i8:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %eax
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: and_i32_as_v4i8:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movd %esi, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-SSE-NEXT:    pand %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <4 x i8>
+  %2 = bitcast i32 %b to <4 x i8>
+  %3 = and <4 x i8> %1, %2
+  %4 = bitcast <4 x i8> %3 to i32
+  ret i32 %4
+}
+
+define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v4i8:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %eax
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    pxor %xmm0, %xmm1
+; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: xor_i32_as_v4i8:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movd %esi, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-SSE-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <4 x i8>
+  %2 = bitcast i32 %b to <4 x i8>
+  %3 = xor <4 x i8> %1, %2
+  %4 = bitcast <4 x i8> %3 to i32
+  ret i32 %4
+}
+
+define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v4i8:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %eax
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-SSE-NEXT:    por %xmm0, %xmm1
+; X32-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    popl %ecx
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: or_i32_as_v4i8:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movd %esi, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    movd %edi, %xmm1
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-SSE-NEXT:    por %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <4 x i8>
+  %2 = bitcast i32 %b to <4 x i8>
+  %3 = or <4 x i8> %1, %2
+  %4 = bitcast <4 x i8> %3 to i32
+  ret i32 %4
+}
+
+;
+; AND/XOR/OR of i32 arguments bitcast to v8i4, result bitcast back to i32
+;
+
+define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v8i4:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    movl 12(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl 8(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm1
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl (%esp), %eax
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: and_i32_as_v8i4:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %esi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
+; X64-SSE-NEXT:    shrl $28, %esi
+; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %edi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
+; X64-SSE-NEXT:    shrl $28, %edi
+; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
+; X64-SSE-NEXT:    pand %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <8 x i4>
+  %2 = bitcast i32 %b to <8 x i4>
+  %3 = and <8 x i4> %1, %2
+  %4 = bitcast <8 x i4> %3 to i32
+  ret i32 %4
+}
+
+define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v8i4:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    movl 12(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl 8(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm1
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; X32-SSE-NEXT:    pxor %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl (%esp), %eax
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: xor_i32_as_v8i4:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %esi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
+; X64-SSE-NEXT:    shrl $28, %esi
+; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %edi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
+; X64-SSE-NEXT:    shrl $28, %edi
+; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
+; X64-SSE-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <8 x i4>
+  %2 = bitcast i32 %b to <8 x i4>
+  %3 = xor <8 x i4> %1, %2
+  %4 = bitcast <8 x i4> %3 to i32
+  ret i32 %4
+}
+
+define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v8i4:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    movl 12(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl 8(%ebp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm1
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; X32-SSE-NEXT:    por %xmm0, %xmm1
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl (%esp), %eax
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: or_i32_as_v8i4:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %esi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm0
+; X64-SSE-NEXT:    movl %esi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm0
+; X64-SSE-NEXT:    shrl $28, %esi
+; X64-SSE-NEXT:    pinsrw $7, %esi, %xmm0
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $4, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movl %edi, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $8, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $12, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $16, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $4, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $20, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $5, %eax, %xmm1
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    shrl $24, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    pinsrw $6, %eax, %xmm1
+; X64-SSE-NEXT:    shrl $28, %edi
+; X64-SSE-NEXT:    pinsrw $7, %edi, %xmm1
+; X64-SSE-NEXT:    por %xmm0, %xmm1
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    retq
+  %1 = bitcast i32 %a to <8 x i4>
+  %2 = bitcast i32 %b to <8 x i4>
+  %3 = or <8 x i4> %1, %2
+  %4 = bitcast <8 x i4> %3 to i32
+  ret i32 %4
+}
+
+;
+; AND/XOR/OR of v4i8 arguments bitcast to i32, result bitcast back to v4i8
+;
+
+define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: and_v4i8_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    subl $12, %esp
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X32-SSE-NEXT:    movd %xmm0, %ecx
+; X32-SSE-NEXT:    andl %eax, %ecx
+; X32-SSE-NEXT:    movd %ecx, %xmm0
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: and_v4i8_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X64-SSE-NEXT:    movd %xmm0, %ecx
+; X64-SSE-NEXT:    andl %eax, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <4 x i8> %a to i32
+  %2 = bitcast <4 x i8> %b to i32
+  %3 = and i32 %1, %2
+  %4 = bitcast i32 %3 to <4 x i8>
+  ret <4 x i8>  %4
+}
+
+define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: xor_v4i8_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    subl $12, %esp
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X32-SSE-NEXT:    movd %xmm0, %ecx
+; X32-SSE-NEXT:    xorl %eax, %ecx
+; X32-SSE-NEXT:    movd %ecx, %xmm0
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: xor_v4i8_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X64-SSE-NEXT:    movd %xmm0, %ecx
+; X64-SSE-NEXT:    xorl %eax, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <4 x i8> %a to i32
+  %2 = bitcast <4 x i8> %b to i32
+  %3 = xor i32 %1, %2
+  %4 = bitcast i32 %3 to <4 x i8>
+  ret <4 x i8>  %4
+}
+
+define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: or_v4i8_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    subl $12, %esp
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X32-SSE-NEXT:    movd %xmm0, %ecx
+; X32-SSE-NEXT:    orl %eax, %ecx
+; X32-SSE-NEXT:    movd %ecx, %xmm0
+; X32-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-SSE-NEXT:    addl $12, %esp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: or_v4i8_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    pshufb %xmm2, %xmm0
+; X64-SSE-NEXT:    movd %xmm0, %ecx
+; X64-SSE-NEXT:    orl %eax, %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <4 x i8> %a to i32
+  %2 = bitcast <4 x i8> %b to i32
+  %3 = or i32 %1, %2
+  %4 = bitcast i32 %3 to <4 x i8>
+  ret <4 x i8>  %4
+}
+
+;
+; AND/XOR/OR v8i4 as i32
+;
+
+define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind { ; AND of two <8 x i4> vectors, performed on their i32 bitcasts
+; X32-SSE-LABEL: and_v8i4_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movd %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    andl (%esp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: and_v8i4_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $4, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movl %eax, %edx
+; X64-SSE-NEXT:    andl $15, %edx
+; X64-SSE-NEXT:    movd %edx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $8, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $12, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $16, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $20, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $24, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X64-SSE-NEXT:    shrl $28, %eax
+; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <8 x i4> %a to i32 ; reinterpret the eight 4-bit lanes of %a as one 32-bit scalar
+  %2 = bitcast <8 x i4> %b to i32 ; same reinterpretation for %b
+  %3 = and i32 %1, %2 ; the bit operation under test, done in the scalar domain
+  %4 = bitcast i32 %3 to <8 x i4> ; reinterpret the scalar result as <8 x i4> for the return
+  ret <8 x i4>  %4
+}
+
+define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind { ; XOR of two <8 x i4> vectors, performed on their i32 bitcasts
+; X32-SSE-LABEL: xor_v8i4_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movd %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    xorl (%esp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: xor_v8i4_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    xorl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $4, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movl %eax, %edx
+; X64-SSE-NEXT:    andl $15, %edx
+; X64-SSE-NEXT:    movd %edx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $8, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $12, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $16, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $20, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $24, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X64-SSE-NEXT:    shrl $28, %eax
+; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <8 x i4> %a to i32 ; reinterpret the eight 4-bit lanes of %a as one 32-bit scalar
+  %2 = bitcast <8 x i4> %b to i32 ; same reinterpretation for %b
+  %3 = xor i32 %1, %2 ; the bit operation under test, done in the scalar domain
+  %4 = bitcast i32 %3 to <8 x i4> ; reinterpret the scalar result as <8 x i4> for the return
+  ret <8 x i4>  %4
+}
+
+define <8 x i4> @or_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind { ; OR of two <8 x i4> vectors, performed on their i32 bitcasts
+; X32-SSE-LABEL: or_v8i4_as_i32:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-8, %esp
+; X32-SSE-NEXT:    subl $24, %esp
+; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movd %xmm0, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movd %xmm1, %eax
+; X32-SSE-NEXT:    andl $15, %eax
+; X32-SSE-NEXT:    movb %al, (%esp)
+; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT:    orl (%esp), %eax
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $4, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    movl %eax, %edx
+; X32-SSE-NEXT:    andl $15, %edx
+; X32-SSE-NEXT:    movd %edx, %xmm0
+; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $8, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $12, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $16, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $20, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE-NEXT:    movl %eax, %ecx
+; X32-SSE-NEXT:    shrl $24, %ecx
+; X32-SSE-NEXT:    andl $15, %ecx
+; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE-NEXT:    shrl $28, %eax
+; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: or_v8i4_as_i32:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    pextrw $7, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm0, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $7, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $6, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $5, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $4, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $3, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $2, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    andl $15, %eax
+; X64-SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    orl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $4, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    movl %eax, %edx
+; X64-SSE-NEXT:    andl $15, %edx
+; X64-SSE-NEXT:    movd %edx, %xmm0
+; X64-SSE-NEXT:    pinsrw $1, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $8, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $2, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $12, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $16, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $20, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $5, %ecx, %xmm0
+; X64-SSE-NEXT:    movl %eax, %ecx
+; X64-SSE-NEXT:    shrl $24, %ecx
+; X64-SSE-NEXT:    andl $15, %ecx
+; X64-SSE-NEXT:    pinsrw $6, %ecx, %xmm0
+; X64-SSE-NEXT:    shrl $28, %eax
+; X64-SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; X64-SSE-NEXT:    retq
+  %1 = bitcast <8 x i4> %a to i32 ; reinterpret the eight 4-bit lanes of %a as one 32-bit scalar
+  %2 = bitcast <8 x i4> %b to i32 ; same reinterpretation for %b
+  %3 = or i32 %1, %2 ; the bit operation under test, done in the scalar domain
+  %4 = bitcast i32 %3 to <8 x i4> ; reinterpret the scalar result as <8 x i4> for the return
+  ret <8 x i4>  %4
+}




More information about the llvm-commits mailing list