[llvm] r283666 - [X86][AVX2] Regenerate and add 32-bit tests to core tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 8 11:36:57 PDT 2016


Author: rksimon
Date: Sat Oct  8 13:36:57 2016
New Revision: 283666

URL: http://llvm.org/viewvc/llvm-project?rev=283666&view=rev
Log:
[X86][AVX2] Regenerate and add 32-bit tests to core tests

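For reference, the new X32/X64 check lines in these tests come from the utils/update_llc_test_checks.py script named in the autogenerated NOTE headers below. A minimal sketch of regenerating one of the modified files, assuming a built llc is on PATH and the command is run from the llvm source root (exact flags may differ between script versions):

    # Re-emit the per-function CHECK blocks for a single test from its RUN lines.
    utils/update_llc_test_checks.py test/CodeGen/X86/avx2-arith.ll

The script parses each RUN line, invokes llc on the test, and rewrites the per-prefix assertions (the X32-LABEL/X64-LABEL blocks seen in the diff) in place, which is why adding the i686 RUN line produces a full second set of checks for every function.
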
Modified:
    llvm/trunk/test/CodeGen/X86/avx2-arith.ll
    llvm/trunk/test/CodeGen/X86/avx2-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx2-conversions.ll
    llvm/trunk/test/CodeGen/X86/avx2-fma-fneg-combine.ll
    llvm/trunk/test/CodeGen/X86/avx2-gather.ll
    llvm/trunk/test/CodeGen/X86/avx2-logic.ll
    llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll
    llvm/trunk/test/CodeGen/X86/avx2-shift.ll
    llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
    llvm/trunk/test/CodeGen/X86/avx2-vperm.ll

Modified: llvm/trunk/test/CodeGen/X86/avx2-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-arith.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-arith.ll Sat Oct  8 13:36:57 2016
@@ -1,211 +1,326 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
-; CHECK: vpaddq %ymm
 define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; X32-LABEL: test_vpaddq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpaddq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = add <4 x i64> %i, %j
   ret <4 x i64> %x
 }
 
-; CHECK: vpaddd %ymm
 define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; X32-LABEL: test_vpaddd:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpaddd:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = add <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vpaddw %ymm
 define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; X32-LABEL: test_vpaddw:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpaddw:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = add <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: vpaddb %ymm
 define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; X32-LABEL: test_vpaddb:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpaddb:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = add <32 x i8> %i, %j
   ret <32 x i8> %x
 }
 
-; CHECK: vpsubq %ymm
 define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; X32-LABEL: test_vpsubq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpsubq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = sub <4 x i64> %i, %j
   ret <4 x i64> %x
 }
 
-; CHECK: vpsubd %ymm
 define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; X32-LABEL: test_vpsubd:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpsubd:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = sub <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vpsubw %ymm
 define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; X32-LABEL: test_vpsubw:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpsubw:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = sub <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: vpsubb %ymm
 define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; X32-LABEL: test_vpsubb:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpsubb:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = sub <32 x i8> %i, %j
   ret <32 x i8> %x
 }
 
-; CHECK: vpmulld %ymm
 define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; X32-LABEL: test_vpmulld:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpmulld:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = mul <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vpmullw %ymm
 define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; X32-LABEL: test_vpmullw:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpmullw:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = mul <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: mul-v16i8
-; CHECK:       # BB#0:
-; CHECK-NEXT:  vpmovsxbw %xmm1, %ymm1
-; CHECK-NEXT:  vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT:  vpmullw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:  vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:  vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:  vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:  vzeroupper
-; CHECK-NEXT:  retq
 define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
   %x = mul <16 x i8> %i, %j
   ret <16 x i8> %x
 }
 
-; CHECK: mul-v32i8
-; CHECK:       # BB#0:
-; CHECK-NEXT:  vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:  vpmovsxbw %xmm2, %ymm2
-; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT:  vpmovsxbw %xmm3, %ymm3
-; CHECK-NEXT:  vpmullw %ymm2, %ymm3, %ymm2
-; CHECK-NEXT:  vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:  vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:  vpshufb %xmm4, %xmm3, %xmm3
-; CHECK-NEXT:  vpshufb %xmm4, %xmm2, %xmm2
-; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; CHECK-NEXT:  vpmovsxbw %xmm1, %ymm1
-; CHECK-NEXT:  vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT:  vpmullw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:  vpshufb %xmm4, %xmm1, %xmm1
-; CHECK-NEXT:  vpshufb %xmm4, %xmm0, %xmm0
-; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:  vinserti128 $1, %xmm2, %ymm0, %ymm0
-; CHECK-NEXT:  retq
 define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
   %x = mul <32 x i8> %i, %j
   ret <32 x i8> %x
 }
 
-; CHECK: mul-v4i64
-; CHECK: vpmuludq %ymm
-; CHECK-NEXT: vpsrlq $32, %ymm
-; CHECK-NEXT: vpmuludq %ymm
-; CHECK-NEXT: vpsllq $32, %ymm
-; CHECK-NEXT: vpaddq %ymm
-; CHECK-NEXT: vpsrlq $32, %ymm
-; CHECK-NEXT: vpmuludq %ymm
-; CHECK-NEXT: vpsllq $32, %ymm
-; CHECK-NEXT: vpaddq %ymm
 define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
   %x = mul <4 x i64> %i, %j
   ret <4 x i64> %x
 }
 
-; CHECK: mul_const1
-; CHECK: vpaddd
-; CHECK: ret
 define <8 x i32> @mul_const1(<8 x i32> %x) {
+; X32-LABEL: mul_const1:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const1:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i32> %y
 }
 
-; CHECK: mul_const2
-; CHECK: vpsllq  $2
-; CHECK: ret
 define <4 x i64> @mul_const2(<4 x i64> %x) {
+; X32-LABEL: mul_const2:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllq $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const2:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllq $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
   ret <4 x i64> %y
 }
 
-; CHECK: mul_const3
-; CHECK: vpsllw  $3
-; CHECK: ret
 define <16 x i16> @mul_const3(<16 x i16> %x) {
+; X32-LABEL: mul_const3:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $3, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const3:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $3, %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   ret <16 x i16> %y
 }
 
-; CHECK: mul_const4
-; CHECK: vpxor
-; CHECK: vpsubq
-; CHECK: ret
 define <4 x i64> @mul_const4(<4 x i64> %x) {
+; X32-LABEL: mul_const4:
+; X32:       ## BB#0:
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const4:
+; X64:       ## BB#0:
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
   ret <4 x i64> %y
 }
 
-; CHECK: mul_const5
-; CHECK: vxorps
-; CHECK-NEXT: ret
 define <8 x i32> @mul_const5(<8 x i32> %x) {
+; X32-LABEL: mul_const5:
+; X32:       ## BB#0:
+; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const5:
+; X64:       ## BB#0:
+; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %y
 }
 
-; CHECK: mul_const6
-; CHECK: vpmulld
-; CHECK: ret
 define <8 x i32> @mul_const6(<8 x i32> %x) {
+; X32-LABEL: mul_const6:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmulld LCPI18_0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const6:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
   ret <8 x i32> %y
 }
 
-; CHECK: mul_const7
-; CHECK: vpaddq
-; CHECK: vpaddq
-; CHECK: ret
 define <8 x i64> @mul_const7(<8 x i64> %x) {
+; X32-LABEL: mul_const7:
+; X32:       ## BB#0:
+; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
+; X32-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const7:
+; X64:       ## BB#0:
+; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
+; X64-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; X64-NEXT:    retq
   %y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
   ret <8 x i64> %y
 }
 
-; CHECK: mul_const8
-; CHECK: vpsllw  $3
-; CHECK: ret
 define <8 x i16> @mul_const8(<8 x i16> %x) {
+; X32-LABEL: mul_const8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $3, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $3, %xmm0, %xmm0
+; X64-NEXT:    retq
   %y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   ret <8 x i16> %y
 }
 
-; CHECK: mul_const9
-; CHECK: vpmulld
-; CHECK: ret
 define <8 x i32> @mul_const9(<8 x i32> %x) {
+; X32-LABEL: mul_const9:
+; X32:       ## BB#0:
+; X32-NEXT:    movl $2, %eax
+; X32-NEXT:    vmovd %eax, %xmm1
+; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const9:
+; X64:       ## BB#0:
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %y
 }
 
-; CHECK: mul_const10
-; CHECK: vpmulld
-; CHECK: ret
 define <4 x i32> @mul_const10(<4 x i32> %x) {
   ; %x * 0x01010101
+; X32-LABEL: mul_const10:
+; X32:       ## BB#0:
+; X32-NEXT:    vpbroadcastd LCPI22_0, %xmm1
+; X32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const10:
+; X64:       ## BB#0:
+; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
   ret <4 x i32> %m
 }
 
-; CHECK: mul_const11
-; CHECK: vpmulld
-; CHECK: ret
 define <4 x i32> @mul_const11(<4 x i32> %x) {
   ; %x * 0x80808080
+; X32-LABEL: mul_const11:
+; X32:       ## BB#0:
+; X32-NEXT:    vpbroadcastd LCPI23_0, %xmm1
+; X32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: mul_const11:
+; X64:       ## BB#0:
+; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
   ret <4 x i32> %m
 }

Modified: llvm/trunk/test/CodeGen/X86/avx2-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-cmp.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-cmp.ll Sat Oct  8 13:36:57 2016
@@ -1,58 +1,123 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vpcmpgtd  %ymm
-define <8 x i32> @int256-cmp(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define <8 x i32> @v8i32_cmpgt(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; X32-LABEL: v8i32_cmpgt:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v8i32_cmpgt:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp slt <8 x i32> %i, %j
   %x = sext <8 x i1> %bincmp to <8 x i32>
   ret <8 x i32> %x
 }
 
-; CHECK: vpcmpgtq  %ymm
-define <4 x i64> @v4i64-cmp(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmpgt(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; X32-LABEL: v4i64_cmpgt:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v4i64_cmpgt:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp slt <4 x i64> %i, %j
   %x = sext <4 x i1> %bincmp to <4 x i64>
   ret <4 x i64> %x
 }
 
-; CHECK: vpcmpgtw  %ymm
-define <16 x i16> @v16i16-cmp(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmpgt(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; X32-LABEL: v16i16_cmpgt:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v16i16_cmpgt:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp slt <16 x i16> %i, %j
   %x = sext <16 x i1> %bincmp to <16 x i16>
   ret <16 x i16> %x
 }
 
-; CHECK: vpcmpgtb  %ymm
-define <32 x i8> @v32i8-cmp(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmpgt(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; X32-LABEL: v32i8_cmpgt:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v32i8_cmpgt:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp slt <32 x i8> %i, %j
   %x = sext <32 x i1> %bincmp to <32 x i8>
   ret <32 x i8> %x
 }
 
-; CHECK: vpcmpeqd  %ymm
-define <8 x i32> @int256-cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; X32-LABEL: int256_cmpeq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: int256_cmpeq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp eq <8 x i32> %i, %j
   %x = sext <8 x i1> %bincmp to <8 x i32>
   ret <8 x i32> %x
 }
 
-; CHECK: vpcmpeqq  %ymm
-define <4 x i64> @v4i64-cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; X32-LABEL: v4i64_cmpeq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v4i64_cmpeq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp eq <4 x i64> %i, %j
   %x = sext <4 x i1> %bincmp to <4 x i64>
   ret <4 x i64> %x
 }
 
-; CHECK: vpcmpeqw  %ymm
-define <16 x i16> @v16i16-cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; X32-LABEL: v16i16_cmpeq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v16i16_cmpeq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp eq <16 x i16> %i, %j
   %x = sext <16 x i1> %bincmp to <16 x i16>
   ret <16 x i16> %x
 }
 
-; CHECK: vpcmpeqb  %ymm
-define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; X32-LABEL: v32i8_cmpeq:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: v32i8_cmpeq:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %bincmp = icmp eq <32 x i8> %i, %j
   %x = sext <32 x i1> %bincmp to <32 x i8>
   ret <32 x i8> %x
 }
-

Modified: llvm/trunk/test/CodeGen/X86/avx2-conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-conversions.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-conversions.ll Sat Oct  8 13:36:57 2016
@@ -1,153 +1,246 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
-; CHECK-LABEL: trunc4:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: trunc4:
+; X32:       ## BB#0:
+; X32-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: trunc4:
+; X64:       ## BB#0:
+; X64-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %B = trunc <4 x i64> %A to <4 x i32>
   ret <4 x i32>%B
 }
 
 define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
-; CHECK-LABEL: trunc8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: trunc8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: trunc8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %B = trunc <8 x i32> %A to <8 x i16>
   ret <8 x i16>%B
 }
 
 define <4 x i64> @sext4(<4 x i32> %A) nounwind {
-; CHECK-LABEL: sext4:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: sext4:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovsxdq %xmm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sext4:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxdq %xmm0, %ymm0
+; X64-NEXT:    retq
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
 
 define <8 x i32> @sext8(<8 x i16> %A) nounwind {
-; CHECK-LABEL: sext8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: sext8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sext8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X64-NEXT:    retq
   %B = sext <8 x i16> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <4 x i64> @zext4(<4 x i32> %A) nounwind {
-; CHECK-LABEL: zext4:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    retq
+; X32-LABEL: zext4:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: zext4:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT:    retq
   %B = zext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
 
 define <8 x i32> @zext8(<8 x i16> %A) nounwind {
-; CHECK-LABEL: zext8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    retq
+; X32-LABEL: zext8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: zext8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    retq
   %B = zext <8 x i16> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
-; CHECK-LABEL: zext_8i8_8i32:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    retq
+; X32-LABEL: zext_8i8_8i32:
+; X32:       ## BB#0:
+; X32-NEXT:    vpand LCPI6_0, %xmm0, %xmm0
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: zext_8i8_8i32:
+; X64:       ## BB#0:
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    retq
   %B = zext <8 x i8> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) {
-; CHECK-LABEL: zext_16i8_16i16:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT:    retq
+; X32-LABEL: zext_16i8_16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: zext_16i8_16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-NEXT:    retq
   %t = zext <16 x i8> %z to <16 x i16>
   ret <16 x i16> %t
 }
 
 define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
-; CHECK-LABEL: sext_16i8_16i16:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: sext_16i8_16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sext_16i8_16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-NEXT:    retq
   %t = sext <16 x i8> %z to <16 x i16>
   ret <16 x i16> %t
 }
 
 define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
-; CHECK-LABEL: trunc_16i16_16i8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: trunc_16i16_16i8:
+; X32:       ## BB#0:
+; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X32-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; X32-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; X32-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: trunc_16i16_16i8:
+; X64:       ## BB#0:
+; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; X64-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %t = trunc <16 x i16> %z to <16 x i8>
   ret <16 x i8> %t
 }
 
 define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
-; CHECK-LABEL: load_sext_test1:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxdq (%rdi), %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: load_sext_test1:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmovsxdq (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: load_sext_test1:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxdq (%rdi), %ymm0
+; X64-NEXT:    retq
  %X = load <4 x i32>, <4 x i32>* %ptr
  %Y = sext <4 x i32> %X to <4 x i64>
  ret <4 x i64>%Y
 }
 
 define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
-; CHECK-LABEL: load_sext_test2:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxbq (%rdi), %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: load_sext_test2:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmovsxbq (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: load_sext_test2:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxbq (%rdi), %ymm0
+; X64-NEXT:    retq
  %X = load <4 x i8>, <4 x i8>* %ptr
  %Y = sext <4 x i8> %X to <4 x i64>
  ret <4 x i64>%Y
 }
 
 define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
-; CHECK-LABEL: load_sext_test3:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxwq (%rdi), %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: load_sext_test3:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmovsxwq (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: load_sext_test3:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxwq (%rdi), %ymm0
+; X64-NEXT:    retq
  %X = load <4 x i16>, <4 x i16>* %ptr
  %Y = sext <4 x i16> %X to <4 x i64>
  ret <4 x i64>%Y
 }
 
 define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
-; CHECK-LABEL: load_sext_test4:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxwd (%rdi), %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: load_sext_test4:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmovsxwd (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: load_sext_test4:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxwd (%rdi), %ymm0
+; X64-NEXT:    retq
  %X = load <8 x i16>, <8 x i16>* %ptr
  %Y = sext <8 x i16> %X to <8 x i32>
  ret <8 x i32>%Y
 }
 
 define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
-; CHECK-LABEL: load_sext_test5:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovsxbd (%rdi), %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: load_sext_test5:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmovsxbd (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: load_sext_test5:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovsxbd (%rdi), %ymm0
+; X64-NEXT:    retq
  %X = load <8 x i8>, <8 x i8>* %ptr
  %Y = sext <8 x i8> %X to <8 x i32>
  ret <8 x i32>%Y

Modified: llvm/trunk/test/CodeGen/X86/avx2-fma-fneg-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-fma-fneg-combine.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-fma-fneg-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-fma-fneg-combine.ll Sat Oct  8 13:36:57 2016
@@ -1,13 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -mattr=+fma  | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 ; This test checks combinations of FNEG and FMA intrinsics
 
 define <8 x float> @test1(<8 x float> %a, <8 x float> %b, <8 x float> %c)  {
-; CHECK-LABEL: test1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
   %0 = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
@@ -17,10 +23,15 @@ entry:
 declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
 
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: test2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
@@ -30,12 +41,19 @@ entry:
 declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c)
 
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c)  {
-; CHECK-LABEL: test3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
+; X32-NEXT:    vbroadcastss LCPI2_0, %xmm1
+; X32-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
+; X64-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
@@ -45,10 +63,15 @@ entry:
 declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
 
 define <8 x float> @test4(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; CHECK-LABEL: test4:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test4:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
@@ -56,12 +79,19 @@ entry:
 }
 
 define <8 x float> @test5(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; CHECK-LABEL: test5:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; CHECK-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test5:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vbroadcastss LCPI4_0, %ymm3
+; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test5:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
+; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
   %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
@@ -72,10 +102,15 @@ declare <8 x float> @llvm.x86.fma.vfmsub
 
 
 define <2 x double> @test6(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: test6:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test6:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test6:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %0

Modified: llvm/trunk/test/CodeGen/X86/avx2-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-gather.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-gather.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-gather.ll Sat Oct  8 13:36:57 2016
@@ -1,61 +1,87 @@
-; RUN: not llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
                       <4 x i32>, <4 x float>, i8) nounwind readonly
 
-define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1,
-                     <4 x i32> %idx, <4 x float> %mask) {
+define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, <4 x i32> %idx, <4 x float> %mask) {
+; X32-LABEL: test_x86_avx2_gather_d_ps:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2
+; X32-NEXT:    vmovaps %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_gather_d_ps:
+; X64:       ## BB#0:
+; X64-NEXT:    vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2
+; X64-NEXT:    vmovaps %xmm2, %xmm0
+; X64-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef,
                             i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
   ret <4 x float> %res
 }
 
-; CHECK: test_x86_avx2_gather_d_ps
-; CHECK: vgatherdps
-; CHECK-NOT: [[DST]]
-; CHECK: [[DST:%xmm[0-9]+]]{{$}}
-; CHECK: vmovaps
-; CHECK: ret
-
 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
                       <4 x i32>, <2 x double>, i8) nounwind readonly
 
-define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1,
-                     <4 x i32> %idx, <2 x double> %mask) {
+define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
+; X32-LABEL: test_x86_avx2_gather_d_pd:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2
+; X32-NEXT:    vmovapd %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_gather_d_pd:
+; X64:       ## BB#0:
+; X64-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2
+; X64-NEXT:    vmovapd %xmm2, %xmm0
+; X64-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef,
                             i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
   ret <2 x double> %res
 }
 
-; CHECK: test_x86_avx2_gather_d_pd
-; CHECK: vgatherdpd
-; CHECK: vmovapd
-; CHECK: ret
-
 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
                       <8 x i32>, <8 x float>, i8) nounwind readonly
 
-define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1,
-                     <8 x i32> %idx, <8 x float> %mask) {
+define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1, <8 x i32> %idx, <8 x float> %mask) {
+; X32-LABEL: test_x86_avx2_gather_d_ps_256:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
+; X32-NEXT:    vmovaps %ymm2, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_gather_d_ps_256:
+; X64:       ## BB#0:
+; X64-NEXT:    vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2
+; X64-NEXT:    vmovaps %ymm2, %ymm0
+; X64-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef,
                             i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
   ret <8 x float> %res
 }
-; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256
-; CHECK: vgatherdps %ymm
-; CHECK: ret
 
 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
                       <4 x i32>, <4 x double>, i8) nounwind readonly
 
-define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1,
-                     <4 x i32> %idx, <4 x double> %mask) {
+define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x double> %mask) {
+; X32-LABEL: test_x86_avx2_gather_d_pd_256:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2
+; X32-NEXT:    vmovapd %ymm2, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_gather_d_pd_256:
+; X64:       ## BB#0:
+; X64-NEXT:    vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2
+; X64-NEXT:    vmovapd %ymm2, %ymm0
+; X64-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef,
                             i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 8) ;
   ret <4 x double> %res
 }
-
-; CHECK-LABEL: test_x86_avx2_gather_d_pd_256
-; CHECK: vgatherdpd %ymm
-; CHECK: ret

Modified: llvm/trunk/test/CodeGen/X86/avx2-logic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-logic.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-logic.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-logic.ll Sat Oct  8 13:36:57 2016
@@ -1,9 +1,20 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
-; CHECK: vpandn
-; CHECK: vpandn  %ymm
-; CHECK: ret
 define <4 x i64> @vpandn(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; X32-LABEL: vpandn:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddq LCPI0_0, %ymm0, %ymm1
+; X32-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vpandn:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; X64-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
@@ -12,10 +23,19 @@ entry:
   ret <4 x i64> %x
 }
 
-; CHECK: vpand
-; CHECK: vpand %ymm
-; CHECK: ret
 define <4 x i64> @vpand(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; X32-LABEL: vpand:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddq LCPI1_0, %ymm0, %ymm0
+; X32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vpand:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
@@ -23,10 +43,19 @@ entry:
   ret <4 x i64> %x
 }
 
-; CHECK: vpor
-; CHECK: vpor %ymm
-; CHECK: ret
 define <4 x i64> @vpor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; X32-LABEL: vpor:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddq LCPI2_0, %ymm0, %ymm0
+; X32-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vpor:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
@@ -34,10 +63,19 @@ entry:
   ret <4 x i64> %x
 }
 
-; CHECK: vpxor
-; CHECK: vpxor %ymm
-; CHECK: ret
 define <4 x i64> @vpxor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; X32-LABEL: vpxor:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddq LCPI3_0, %ymm0, %ymm0
+; X32-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vpxor:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
@@ -45,22 +83,46 @@ entry:
   ret <4 x i64> %x
 }
 
-; CHECK: vpblendvb
-; CHECK: vpblendvb %ymm
-; CHECK: ret
 define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
+; X32-LABEL: vpblendvb:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X32-NEXT:    vpand LCPI4_0, %ymm0, %ymm0
+; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vpblendvb:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; X64-NEXT:    retq
   %min = select <32 x i1> %cond, <32 x i8> %x, <32 x i8> %y
   ret <32 x i8> %min
 }
 
 define <8 x i32> @allOnes() nounwind {
-; CHECK: vpcmpeqd
-; CHECK-NOT: vinsert
+; X32-LABEL: allOnes:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: allOnes:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
         ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
 }
 
 define <16 x i16> @allOnes2() nounwind {
-; CHECK: vpcmpeqd
-; CHECK-NOT: vinsert
+; X32-LABEL: allOnes2:
+; X32:       ## BB#0:
+; X32-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: allOnes2:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
         ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
 }

Modified: llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll Sat Oct  8 13:36:57 2016
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
-; CHECK-LABEL: phaddw1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phaddw1:
+; X32:       ## BB#0:
+; X32-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phaddw1:
+; X64:       ## BB#0:
+; X64-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
   %b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
   %r = add <16 x i16> %a, %b
@@ -13,10 +19,15 @@ define <16 x i16> @phaddw1(<16 x i16> %x
 }
 
 define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) {
-; CHECK-LABEL: phaddw2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phaddw2:
+; X32:       ## BB#0:
+; X32-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phaddw2:
+; X64:       ## BB#0:
+; X64-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
   %b = shufflevector <16 x i16> %y, <16 x i16> %x, <16 x i32> <i32 16, i32 18, i32 20, i32 22, i32 0, i32 2, i32 4, i32 6, i32 24, i32 26, i32 28, i32 30, i32 8, i32 10, i32 12, i32 14>
   %r = add <16 x i16> %a, %b
@@ -24,10 +35,15 @@ define <16 x i16> @phaddw2(<16 x i16> %x
 }
 
 define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) {
-; CHECK-LABEL: phaddd1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phaddd1:
+; X32:       ## BB#0:
+; X32-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phaddd1:
+; X64:       ## BB#0:
+; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = add <8 x i32> %a, %b
@@ -35,10 +51,15 @@ define <8 x i32> @phaddd1(<8 x i32> %x,
 }
 
 define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
-; CHECK-LABEL: phaddd2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phaddd2:
+; X32:       ## BB#0:
+; X32-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phaddd2:
+; X64:       ## BB#0:
+; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
   %b = shufflevector <8 x i32> %y, <8 x i32> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
   %r = add <8 x i32> %a, %b
@@ -46,10 +67,15 @@ define <8 x i32> @phaddd2(<8 x i32> %x,
 }
 
 define <8 x i32> @phaddd3(<8 x i32> %x) {
-; CHECK-LABEL: phaddd3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phaddd3:
+; X32:       ## BB#0:
+; X32-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phaddd3:
+; X64:       ## BB#0:
+; X64-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = add <8 x i32> %a, %b
@@ -57,10 +83,15 @@ define <8 x i32> @phaddd3(<8 x i32> %x)
 }
 
 define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) {
-; CHECK-LABEL: phsubw1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phsubw1:
+; X32:       ## BB#0:
+; X32-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phsubw1:
+; X64:       ## BB#0:
+; X64-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
   %b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
   %r = sub <16 x i16> %a, %b
@@ -68,10 +99,15 @@ define <16 x i16> @phsubw1(<16 x i16> %x
 }
 
 define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) {
-; CHECK-LABEL: phsubd1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phsubd1:
+; X32:       ## BB#0:
+; X32-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phsubd1:
+; X64:       ## BB#0:
+; X64-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = sub <8 x i32> %a, %b
@@ -79,10 +115,15 @@ define <8 x i32> @phsubd1(<8 x i32> %x,
 }
 
 define <8 x i32> @phsubd2(<8 x i32> %x, <8 x i32> %y) {
-; CHECK-LABEL: phsubd2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: phsubd2:
+; X32:       ## BB#0:
+; X32-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: phsubd2:
+; X64:       ## BB#0:
+; X64-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 undef, i32 8, i32 undef, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 undef, i32 9, i32 11, i32 5, i32 7, i32 undef, i32 15>
   %r = sub <8 x i32> %a, %b

Modified: llvm/trunk/test/CodeGen/X86/avx2-shift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-shift.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-shift.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-shift.ll Sat Oct  8 13:36:57 2016
@@ -1,301 +1,603 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
-; CHECK: variable_shl0
-; CHECK: psllvd
-; CHECK: ret
 define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
+; X32-LABEL: variable_shl0:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl0:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %k = shl <4 x i32> %x, %y
   ret <4 x i32> %k
 }
-; CHECK: variable_shl1
-; CHECK: psllvd
-; CHECK: ret
+
 define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
+; X32-LABEL: variable_shl1:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl1:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %k = shl <8 x i32> %x, %y
   ret <8 x i32> %k
 }
-; CHECK: variable_shl2
-; CHECK: psllvq
-; CHECK: ret
+
 define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
+; X32-LABEL: variable_shl2:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl2:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %k = shl <2 x i64> %x, %y
   ret <2 x i64> %k
 }
-; CHECK: variable_shl3
-; CHECK: psllvq
-; CHECK: ret
+
 define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
+; X32-LABEL: variable_shl3:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl3:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %k = shl <4 x i64> %x, %y
   ret <4 x i64> %k
 }
-; CHECK: variable_srl0
-; CHECK: psrlvd
-; CHECK: ret
+
 define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
+; X32-LABEL: variable_srl0:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl0:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %k = lshr <4 x i32> %x, %y
   ret <4 x i32> %k
 }
-; CHECK: variable_srl1
-; CHECK: psrlvd
-; CHECK: ret
+
 define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
+; X32-LABEL: variable_srl1:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl1:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %k = lshr <8 x i32> %x, %y
   ret <8 x i32> %k
 }
-; CHECK: variable_srl2
-; CHECK: psrlvq
-; CHECK: ret
+
 define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
+; X32-LABEL: variable_srl2:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl2:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %k = lshr <2 x i64> %x, %y
   ret <2 x i64> %k
 }
-; CHECK: variable_srl3
-; CHECK: psrlvq
-; CHECK: ret
+
 define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
+; X32-LABEL: variable_srl3:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl3:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %k = lshr <4 x i64> %x, %y
   ret <4 x i64> %k
 }
 
-; CHECK: variable_sra0
-; CHECK: vpsravd
-; CHECK: ret
 define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
+; X32-LABEL: variable_sra0:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_sra0:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
   %k = ashr <4 x i32> %x, %y
   ret <4 x i32> %k
 }
-; CHECK: variable_sra1
-; CHECK: vpsravd
-; CHECK: ret
+
 define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
+; X32-LABEL: variable_sra1:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_sra1:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %k = ashr <8 x i32> %x, %y
   ret <8 x i32> %k
 }
 
 ;;; Shift left
-; CHECK: vpslld
+
 define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
-  %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+; X32-LABEL: vshift00:
+; X32:       ## BB#0:
+; X32-NEXT:    vpslld $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift00:
+; X64:       ## BB#0:
+; X64-NEXT:    vpslld $2, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsllw
 define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
+; X32-LABEL: vshift01:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift01:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsllq
 define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
+; X32-LABEL: vshift02:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllq $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift02:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllq $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Logical Shift right
-; CHECK: vpsrld
+
 define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
-  %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+; X32-LABEL: vshift03:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrld $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift03:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrld $2, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsrlw
 define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
+; X32-LABEL: vshift04:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlw $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift04:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlw $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsrlq
 define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
+; X32-LABEL: vshift05:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlq $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift05:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlq $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Arithmetic Shift right
-; CHECK: vpsrad
+
 define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
-  %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+; X32-LABEL: vshift06:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrad $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift06:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrad $2, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsraw
 define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
+; X32-LABEL: vshift07:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsraw $2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vshift07:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsraw $2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: variable_sra0_load
-; CHECK: vpsravd (%
-; CHECK: ret
 define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
+; X32-LABEL: variable_sra0_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsravd (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_sra0_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsravd (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %y1 = load <4 x i32>, <4 x i32>* %y
   %k = ashr <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
 
-; CHECK: variable_sra1_load
-; CHECK: vpsravd (%
-; CHECK: ret
 define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
+; X32-LABEL: variable_sra1_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsravd (%eax), %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_sra1_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsravd (%rdi), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y1 = load <8 x i32>, <8 x i32>* %y
   %k = ashr <8 x i32> %x, %y1
   ret <8 x i32> %k
 }
 
-; CHECK: variable_shl0_load
-; CHECK: vpsllvd (%
-; CHECK: ret
 define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
+; X32-LABEL: variable_shl0_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsllvd (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl0_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %y1 = load <4 x i32>, <4 x i32>* %y
   %k = shl <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
-; CHECK: variable_shl1_load
-; CHECK: vpsllvd (%
-; CHECK: ret
+
 define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
+; X32-LABEL: variable_shl1_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsllvd (%eax), %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl1_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y1 = load <8 x i32>, <8 x i32>* %y
   %k = shl <8 x i32> %x, %y1
   ret <8 x i32> %k
 }
-; CHECK: variable_shl2_load
-; CHECK: vpsllvq (%
-; CHECK: ret
+
 define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
+; X32-LABEL: variable_shl2_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsllvq (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl2_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %y1 = load <2 x i64>, <2 x i64>* %y
   %k = shl <2 x i64> %x, %y1
   ret <2 x i64> %k
 }
-; CHECK: variable_shl3_load
-; CHECK: vpsllvq (%
-; CHECK: ret
+
 define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
+; X32-LABEL: variable_shl3_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsllvq (%eax), %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl3_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y1 = load <4 x i64>, <4 x i64>* %y
   %k = shl <4 x i64> %x, %y1
   ret <4 x i64> %k
 }
-; CHECK: variable_srl0_load
-; CHECK: vpsrlvd (%
-; CHECK: ret
+
 define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
+; X32-LABEL: variable_srl0_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsrlvd (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl0_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %y1 = load <4 x i32>, <4 x i32>* %y
   %k = lshr <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
-; CHECK: variable_srl1_load
-; CHECK: vpsrlvd (%
-; CHECK: ret
+
 define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
+; X32-LABEL: variable_srl1_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsrlvd (%eax), %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl1_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y1 = load <8 x i32>, <8 x i32>* %y
   %k = lshr <8 x i32> %x, %y1
   ret <8 x i32> %k
 }
-; CHECK: variable_srl2_load
-; CHECK: vpsrlvq (%
-; CHECK: ret
+
 define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
+; X32-LABEL: variable_srl2_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsrlvq (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl2_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %y1 = load <2 x i64>, <2 x i64>* %y
   %k = lshr <2 x i64> %x, %y1
   ret <2 x i64> %k
 }
-; CHECK: variable_srl3_load
-; CHECK: vpsrlvq (%
-; CHECK: ret
+
 define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
+; X32-LABEL: variable_srl3_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpsrlvq (%eax), %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_srl3_load:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0
+; X64-NEXT:    retq
   %y1 = load <4 x i64>, <4 x i64>* %y
   %k = lshr <4 x i64> %x, %y1
   ret <4 x i64> %k
 }
 
 define <32 x i8> @shl9(<32 x i8> %A) nounwind {
+; X32-LABEL: shl9:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $3, %ymm0, %ymm0
+; X32-NEXT:    vpand LCPI28_0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shl9:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $3, %ymm0, %ymm0
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    retq
   %B = shl <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %B
-; CHECK-LABEL: shl9:
-; CHECK: vpsllw $3
-; CHECK: vpand
-; CHECK: ret
 }
 
 define <32 x i8> @shr9(<32 x i8> %A) nounwind {
+; X32-LABEL: shr9:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; X32-NEXT:    vpand LCPI29_0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shr9:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    retq
   %B = lshr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %B
-; CHECK-LABEL: shr9:
-; CHECK: vpsrlw $3
-; CHECK: vpand
-; CHECK: ret
 }
 
 define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind {
+; X32-LABEL: sra_v32i8_7:
+; X32:       ## BB#0:
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sra_v32i8_7:
+; X64:       ## BB#0:
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %B
-; CHECK-LABEL: sra_v32i8_7:
-; CHECK: vpxor
-; CHECK: vpcmpgtb
-; CHECK: ret
 }
 
 define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
+; X32-LABEL: sra_v32i8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; X32-NEXT:    vpand LCPI31_0, %ymm0, %ymm0
+; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sra_v32i8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X64-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %B = ashr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %B
-; CHECK-LABEL: sra_v32i8:
-; CHECK: vpsrlw $3
-; CHECK: vpand
-; CHECK: vpxor
-; CHECK: vpsubb
-; CHECK: ret
 }
 
-; CHECK: _sext_v16i16
-; CHECK: vpsllw
-; CHECK: vpsraw
-; CHECK-NOT: vinsertf128
 define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
+; X32-LABEL: sext_v16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $8, %ymm0, %ymm0
+; X32-NEXT:    vpsraw $8, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sext_v16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $8, %ymm0, %ymm0
+; X64-NEXT:    vpsraw $8, %ymm0, %ymm0
+; X64-NEXT:    retq
   %b = trunc <16 x i16> %a to <16 x i8>
   %c = sext <16 x i8> %b to <16 x i16>
   ret <16 x i16> %c
 }
 
-; CHECK: _sext_v8i32
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK-NOT: vinsertf128
 define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
+; X32-LABEL: sext_v8i32:
+; X32:       ## BB#0:
+; X32-NEXT:    vpslld $16, %ymm0, %ymm0
+; X32-NEXT:    vpsrad $16, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: sext_v8i32:
+; X64:       ## BB#0:
+; X64-NEXT:    vpslld $16, %ymm0, %ymm0
+; X64-NEXT:    vpsrad $16, %ymm0, %ymm0
+; X64-NEXT:    retq
   %b = trunc <8 x i32> %a to <8 x i16>
   %c = sext <8 x i16> %b to <8 x i32>
   ret <8 x i32> %c
 }
 
 define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8  x i16> %rhs) {
-; CHECK-LABEL: variable_shl16:
-; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
-; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
-; CHECK: vpsllvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
-; CHECK: vpshufb
-; CHECK: vpermq
+; X32-LABEL: variable_shl16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_shl16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %res = shl <8 x i16> %lhs, %rhs
   ret <8 x i16> %res
 }
 
 define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8  x i16> %rhs) {
-; CHECK-LABEL: variable_ashr16:
-; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
-; CHECK-DAG: vpmovsxwd %xmm0, [[LHS:%ymm[0-9]+]]
-; CHECK: vpsravd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
-; CHECK: vpshufb
-; CHECK: vpermq
+; X32-LABEL: variable_ashr16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_ashr16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %res = ashr <8 x i16> %lhs, %rhs
   ret <8 x i16> %res
 }
 
 define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8  x i16> %rhs) {
-; CHECK-LABEL: variable_lshr16:
-; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
-; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
-; CHECK: vpsrlvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
-; CHECK: vpshufb
-; CHECK: vpermq
+; X32-LABEL: variable_lshr16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: variable_lshr16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %res = lshr <8 x i16> %lhs, %rhs
   ret <8 x i16> %res
-}
\ No newline at end of file
+}

Modified: llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll Sat Oct  8 13:36:57 2016
@@ -1,102 +1,152 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 ; AVX2 Logical Shift Left
 
 define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sllw_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllw_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllw_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sllw_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllw_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllw_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sllw_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsllw $15, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllw_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsllw $15, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllw_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   ret <16 x i16> %shl
 }
 
 define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
-; CHECK-LABEL: test_slld_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_slld_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_slld_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
-; CHECK-LABEL: test_slld_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_slld_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_slld_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_vpslld_var(i32 %shift) {
-; CHECK-LABEL: test_vpslld_var:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovd %edi, %xmm0
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
-; CHECK-NEXT:    vpslld %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_vpslld_var:
+; X32:       ## BB#0:
+; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; X32-NEXT:    vpslld %xmm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_vpslld_var:
+; X64:       ## BB#0:
+; X64-NEXT:    vmovd %edi, %xmm0
+; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
   %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
   ret <8 x i32> %tmp
 }
 
 define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
-; CHECK-LABEL: test_slld_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_slld_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpslld $31, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_slld_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpslld $31, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   ret <8 x i32> %shl
 }
 
 define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
-; CHECK-LABEL: test_sllq_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllq_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllq_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
   ret <4 x i64> %shl
 }
 
 define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
-; CHECK-LABEL: test_sllq_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllq_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllq_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
   ret <4 x i64> %shl
 }
 
 define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
-; CHECK-LABEL: test_sllq_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsllq $63, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sllq_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsllq $63, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sllq_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsllq $63, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
   ret <4 x i64> %shl
@@ -105,58 +155,86 @@ entry:
 ; AVX2 Arithmetic Shift
 
 define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sraw_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sraw_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sraw_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sraw_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sraw_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsraw $1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sraw_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsraw $1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
-; CHECK-LABEL: test_sraw_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_sraw_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_sraw_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   ret <16 x i16> %shl
 }
 
 define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srad_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srad_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srad_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srad_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrad $1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srad_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrad $1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srad_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrad $1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srad_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srad_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srad_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   ret <8 x i32> %shl
@@ -165,102 +243,154 @@ entry:
 ; SSE Logical Shift Right
 
 define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
-; CHECK-LABEL: test_srlw_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlw_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlw_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
-; CHECK-LABEL: test_srlw_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlw_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlw_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   ret <16 x i16> %shl
 }
 
 define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
-; CHECK-LABEL: test_srlw_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrlw $15, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlw_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrlw $15, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlw_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrlw $15, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   ret <16 x i16> %shl
 }
 
 define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srld_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srld_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srld_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srld_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrld $1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srld_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrld $1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srld_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrld $1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i32> %shl
 }
 
 define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
-; CHECK-LABEL: test_srld_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srld_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srld_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   ret <8 x i32> %shl
 }
 
 define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
-; CHECK-LABEL: test_srlq_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlq_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlq_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    retq
 entry:
   %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
   ret <4 x i64> %shl
 }
 
 define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
-; CHECK-LABEL: test_srlq_2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrlq $1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlq_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlq_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
   ret <4 x i64> %shl
 }
 
 define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
-; CHECK-LABEL: test_srlq_3:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vpsrlq $63, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: test_srlq_3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpsrlq $63, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_srlq_3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpsrlq $63, %ymm0, %ymm0
+; X64-NEXT:    retq
 entry:
   %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
   ret <4 x i64> %shl
 }
 
 define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: srl_trunc_and_v4i64:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
-; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: srl_trunc_and_v4i64:
+; X32:       ## BB#0:
+; X32-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; X32-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X32-NEXT:    vpbroadcastd LCPI25_0, %xmm2
+; X32-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: srl_trunc_and_v4i64:
+; X64:       ## BB#0:
+; X64-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; X64-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
+; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
   %trunc = trunc <4 x i64> %and to <4 x i32>
   %sra = lshr <4 x i32> %x, %trunc
@@ -272,171 +402,305 @@ define <4 x i32> @srl_trunc_and_v4i64(<4
 ;
 
 define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: shl_8i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: shl_8i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: shl_8i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %shl = shl <8 x i16> %r, %a
   ret <8 x i16> %shl
 }
 
 define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: shl_16i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; CHECK-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: shl_16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shl_16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    retq
   %shl = shl <16 x i16> %r, %a
   ret <16 x i16> %shl
 }
 
 define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: shl_32i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm2
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpsllw $2, %ymm0, %ymm2
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
-; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: shl_32i8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X32-NEXT:    vpsllw $4, %ymm0, %ymm2
+; X32-NEXT:    vpand LCPI28_0, %ymm2, %ymm2
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vpsllw $2, %ymm0, %ymm2
+; X32-NEXT:    vpand LCPI28_1, %ymm2, %ymm2
+; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shl_32i8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %shl = shl <32 x i8> %r, %a
   ret <32 x i8> %shl
 }
 
 define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_8i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: ashr_8i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: ashr_8i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
+; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %ashr = ashr <8 x i16> %r, %a
   ret <8 x i16> %ashr
 }
 
 define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_16i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; CHECK-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: ashr_16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: ashr_16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    retq
   %ashr = ashr <16 x i16> %r, %a
   ret <16 x i16> %ashr
 }
 
 define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: ashr_32i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT:    vpsraw $4, %ymm3, %ymm4
-; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT:    vpsraw $2, %ymm3, %ymm4
-; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT:    vpsraw $1, %ymm3, %ymm4
-; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; CHECK-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT:    vpsraw $4, %ymm0, %ymm3
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    vpsraw $2, %ymm0, %ymm3
-; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm3
-; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: ashr_32i8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; X32-NEXT:    vpsraw $4, %ymm3, %ymm4
+; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-NEXT:    vpsraw $2, %ymm3, %ymm4
+; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-NEXT:    vpsraw $1, %ymm3, %ymm4
+; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; X32-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; X32-NEXT:    vpsraw $4, %ymm0, %ymm3
+; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpsraw $2, %ymm0, %ymm3
+; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpsraw $1, %ymm0, %ymm3
+; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: ashr_32i8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; X64-NEXT:    vpsraw $4, %ymm3, %ymm4
+; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X64-NEXT:    vpsraw $2, %ymm3, %ymm4
+; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X64-NEXT:    vpsraw $1, %ymm3, %ymm4
+; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; X64-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; X64-NEXT:    vpsraw $4, %ymm0, %ymm3
+; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpsraw $2, %ymm0, %ymm3
+; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpsraw $1, %ymm0, %ymm3
+; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %ashr = ashr <32 x i8> %r, %a
   ret <32 x i8> %ashr
 }
 
 define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_8i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X32-LABEL: lshr_8i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
+;
+; X64-LABEL: lshr_8i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %lshr = lshr <8 x i16> %r, %a
   ret <8 x i16> %lshr
 }
 
 define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_16i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; CHECK-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: lshr_16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: lshr_16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    retq
   %lshr = lshr <16 x i16> %r, %a
   ret <16 x i16> %lshr
 }
 
 define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: lshr_32i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: lshr_32i8:
+; X32:       ## BB#0:
+; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X32-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; X32-NEXT:    vpand LCPI34_0, %ymm2, %ymm2
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vpsrlw $2, %ymm0, %ymm2
+; X32-NEXT:    vpand LCPI34_1, %ymm2, %ymm2
+; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vpsrlw $1, %ymm0, %ymm2
+; X32-NEXT:    vpand LCPI34_2, %ymm2, %ymm2
+; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: lshr_32i8:
+; X64:       ## BB#0:
+; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
+; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %lshr = lshr <32 x i8> %r, %a
   ret <32 x i8> %lshr
 }
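
(A note for readers of the lshr_8i16/lshr_16i16 checks above: AVX2 has no variable per-element 16-bit shift, so the backend widens the inputs to 32-bit lanes, does a single vpsrlvd, and truncates back. Below is a rough IR-level sketch of that strategy, not the literal expansion the backend performs; the function name is illustrative and is not part of this patch.)

define <8 x i16> @lshr_8i16_widened(<8 x i16> %r, <8 x i16> %a) nounwind {
  %rext = zext <8 x i16> %r to <8 x i32>    ; vpmovzxwd of the value
  %aext = zext <8 x i16> %a to <8 x i32>    ; vpmovzxwd of the shift amounts
  %srl  = lshr <8 x i32> %rext, %aext       ; one vpsrlvd
  %res  = trunc <8 x i32> %srl to <8 x i16> ; repack to 16-bit lanes
  ret <8 x i16> %res
}

(The byte-element shifts checked above can't use a variable shift at all, which is why their output instead tests each bit of the shift amount via vpsllw $5 followed by vpblendvb selections after shifting by 4, 2 and 1.)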

Modified: llvm/trunk/test/CodeGen/X86/avx2-vperm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vperm.ll?rev=283666&r1=283665&r2=283666&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vperm.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vperm.ll Sat Oct  8 13:36:57 2016
@@ -1,12 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64
 
 define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone {
-; CHECK-LABEL: perm_cl_int_8x32:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: perm_cl_int_8x32:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; X32-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: perm_cl_int_8x32:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; X64-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   %B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>
   ret <8 x i32> %B
@@ -14,31 +21,47 @@ entry:
 
 
 define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
-; CHECK-LABEL: perm_cl_fp_8x32:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; X32-LABEL: perm_cl_fp_8x32:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: perm_cl_fp_8x32:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X64-NEXT:    retq
 entry:
   %B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> <i32 undef, i32 7, i32 2, i32 undef, i32 4, i32 undef, i32 1, i32 6>
   ret <8 x float> %B
 }
 
 define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone {
-; CHECK-LABEL: perm_cl_int_4x64:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
-; CHECK-NEXT:    retq
+; X32-LABEL: perm_cl_int_4x64:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: perm_cl_int_4x64:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X64-NEXT:    retq
 entry:
   %B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i64> %B
 }
 
 define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone {
-; CHECK-LABEL: perm_cl_fp_4x64:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
-; CHECK-NEXT:    retq
+; X32-LABEL: perm_cl_fp_4x64:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: perm_cl_fp_4x64:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X64-NEXT:    retq
 entry:
   %B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x double> %B




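(A hedged aside on the vperm checks above: the 32-bit-element shuffles keep their constant index vector in a register and use vpermd/vpermps, while the 64-bit-element shuffles fold the pattern into a vpermq/vpermpd immediate. The same vpermd pattern can also be written explicitly with the AVX2 cross-lane intrinsic; the sketch below is illustrative only, uses a made-up function name, and is not part of this patch.)

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)

define <8 x i32> @perm_cl_int_8x32_via_intrinsic(<8 x i32> %A) nounwind readnone {
entry:
  ; same indices as the shufflevector in perm_cl_int_8x32 above
  %B = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %A, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>)
  ret <8 x i32> %B
}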